From ff14c1d01576fb839a925a42596582f6c68a1a1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:16 +0200 Subject: x86, mm: Use MAX_DMA_PFN for ZONE_DMA on 32-bit Use MAX_DMA_PFN which represents the 16 MB ISA DMA limit on 32-bit x86 just like we do on 64-bit. Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-1-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 29f7c6d98179..434c97d620c2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -679,8 +679,7 @@ static void __init zone_sizes_init(void) unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM -- cgit v1.2.3 From 4c0b2e5f8940fec7cbeafcf641fecd5e746329c5 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:17 +0200 Subject: x86, mm: Move zone init from paging_init() on 64-bit This patch introduces a zone_sizes_init() helper function on 64-bit to make it more similar to 32-bit init. 
Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-2-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bbaaa005bf0e..3ddda59f7087 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -612,7 +612,7 @@ void __init initmem_init(void) } #endif -void __init paging_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -623,6 +623,11 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; + free_area_init_nodes(max_zone_pfns); +} + +void __init paging_init(void) +{ sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); @@ -634,7 +639,7 @@ void __init paging_init(void) */ node_clear_state(0, N_NORMAL_MEMORY); - free_area_init_nodes(max_zone_pfns); + zone_sizes_init(); } /* -- cgit v1.2.3 From e4794640ca408acda18eb31b126f58a58803b9c9 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:18 +0200 Subject: x86, mm: Use max_pfn instead of highend_pfn The 'highend_pfn' variable is always set to 'max_pfn' so just use the latter directly. 
Acked-by: Tejun Heo Acked-by: Yinghai Lu Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-3-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 434c97d620c2..5ac0118b7610 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -683,7 +683,7 @@ static void __init zone_sizes_init(void) #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif free_area_init_nodes(max_zone_pfns); -- cgit v1.2.3 From 80b3cac97bc14fdf839d967602e599cbf82ea336 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:19 +0200 Subject: x86, mm: Wrap ZONE_DMA32 with CONFIG_ZONE_DMA32 In preparation for unifying 32-bit and 64-bit zone_sizes_init() make sure ZONE_DMA32 is wrapped in CONFIG_ZONE_DMA32. 
Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Acked-by: Arun Sharma Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-4-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3ddda59f7087..a9214e6e721a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -620,7 +620,9 @@ static void __init zone_sizes_init(void) #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif +#ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; +#endif max_zone_pfns[ZONE_NORMAL] = max_pfn; free_area_init_nodes(max_zone_pfns); -- cgit v1.2.3 From ece838b6257412647197c072fe59dfc6615df144 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:20 +0200 Subject: x86, mm: Use max_low_pfn for ZONE_NORMAL on 64-bit 64-bit has no highmem so max_low_pfn is always the same as 'max_pfn'. 
Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-5-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a9214e6e721a..f6b1f087cced 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -623,7 +623,7 @@ static void __init zone_sizes_init(void) #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif - max_zone_pfns[ZONE_NORMAL] = max_pfn; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; free_area_init_nodes(max_zone_pfns); } -- cgit v1.2.3 From 248b52b97da7a712d2263a51d8d84c959f38ef75 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:21 +0200 Subject: x86, mm: Prepare zone_sizes_init() for unification Make 32-bit and 64-bit zone_sizes_init() identical in preparation for unification. 
Acked-by: Tejun Heo Acked-by: Yinghai Lu Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-6-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++++ arch/x86/mm/init_64.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5ac0118b7610..27455b958b8d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -677,9 +677,13 @@ void __init initmem_init(void) static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f6b1f087cced..06c4360cf796 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -624,6 +624,9 @@ static void __init zone_sizes_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif free_area_init_nodes(max_zone_pfns); } -- cgit v1.2.3 From 176239153049a023d060ce95b05f7ef31667e362 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:22 +0200 Subject: x86, mm: Unify zone_sizes_init() Now that zone_sizes_init() is identical on 32-bit and 64-bit, move the code to arch/x86/mm/init.c and use it for both architectures. 
Acked-by: Tejun Heo Acked-by: Yinghai Lu Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-7-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/init.h | 2 ++ arch/x86/mm/init.c | 23 +++++++++++++++++++++++ arch/x86/mm/init_32.c | 19 ------------------- arch/x86/mm/init_64.c | 19 ------------------- 4 files changed, 25 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 8dbe353e41e1..adcc0ae73d09 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,6 +5,8 @@ extern void __init early_ioremap_page_table_range_init(void); #endif +extern void __init zone_sizes_init(void); + extern unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 87488b93a65c..2426b60bb409 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -3,6 +3,7 @@ #include #include #include +#include /* for max_low_pfn */ #include #include @@ -15,6 +16,7 @@ #include #include #include +#include /* for MAX_DMA_PFN */ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; @@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, PAGE_ALIGN(end)); } #endif + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif + + free_area_init_nodes(max_zone_pfns); +} + diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27455b958b8d..3bebaed5021c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ 
-674,25 +674,6 @@ void __init initmem_init(void) } #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -#endif -#ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; -#endif - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = max_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); -} - void __init setup_bootmem_allocator(void) { printk(KERN_INFO " mapped low ram: 0 - %08lx\n", diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06c4360cf796..6fcce7d34555 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -612,25 +612,6 @@ void __init initmem_init(void) } #endif -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -#endif -#ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; -#endif - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = max_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); -} - void __init paging_init(void) { sparse_memory_present_with_active_regions(MAX_NUMNODES); -- cgit v1.2.3 From b7641d2c83aa10031bf45afd82619bfaaedcbc6f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 15:43:02 -0800 Subject: x86-64, syscall: Adjust comment spacing and remove typo Adjust spacing for comment so that it matches the multiline comment style used in the rest of the kernel, and remove word duplication. It is not really clear what version of gcc this refers to, but the extra & doesn't cause any harm, so there is no reason to remove it. Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/syscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d6008295..0edfafa1b269 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -21,9 +21,9 @@ extern void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - *Smells like a like a compiler bug -- it doesn't work - *when the & below is removed. - */ + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ [0 ... __NR_syscall_max] = &sys_ni_syscall, #include }; -- cgit v1.2.3 From e79a7fccfb2ab10f8753ac634a1c8473e870ae6c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 15:48:42 -0800 Subject: x86-64, ia32: Move compat_ni_syscall into C and its own file Move compat_ni_syscall out of ia32entry.S and into its own .c file. Although this is a trivial function, it is not performance-critical, and this will simplify further cleanups. Signed-off-by: H. 
Peter Anvin --- arch/x86/ia32/Makefile | 1 + arch/x86/ia32/ia32entry.S | 3 --- arch/x86/ia32/nosyscall.c | 7 +++++++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 arch/x86/ia32/nosyscall.c (limited to 'arch/x86') diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index 52d0ccfcf6ea..eea9a1c77d38 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) += nosyscall.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a6253ec1b284..59538a777695 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -453,9 +453,6 @@ ia32_badsys: movq $-ENOSYS,%rax jmp ia32_sysret -quiet_ni_syscall: - movq $-ENOSYS,%rax - ret CFI_ENDPROC .macro PTREGSCALL label, func, arg diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c new file mode 100644 index 000000000000..51ecd5b4e787 --- /dev/null +++ b/arch/x86/ia32/nosyscall.c @@ -0,0 +1,7 @@ +#include +#include + +long compat_ni_syscall(void) +{ + return -ENOSYS; +} -- cgit v1.2.3 From d181764ccf6207e02abb95fb3052639b947f4833 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 15:55:49 -0800 Subject: x86: Machine-readable syscall tables and scripts to process them Create a simple set of syscall tables and scripts to turn them into both header files (unistd_*.h) and macros for generating the system call tables. Signed-off-by: H. 
Peter Anvin --- arch/x86/syscalls/Makefile | 43 +++++ arch/x86/syscalls/syscall_32.tbl | 357 +++++++++++++++++++++++++++++++++++++++ arch/x86/syscalls/syscall_64.tbl | 320 +++++++++++++++++++++++++++++++++++ arch/x86/syscalls/syscallhdr.sh | 36 ++++ arch/x86/syscalls/syscalltbl.sh | 15 ++ 5 files changed, 771 insertions(+) create mode 100644 arch/x86/syscalls/Makefile create mode 100644 arch/x86/syscalls/syscall_32.tbl create mode 100644 arch/x86/syscalls/syscall_64.tbl create mode 100644 arch/x86/syscalls/syscallhdr.sh create mode 100644 arch/x86/syscalls/syscalltbl.sh (limited to 'arch/x86') diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile new file mode 100644 index 000000000000..564b2476fede --- /dev/null +++ b/arch/x86/syscalls/Makefile @@ -0,0 +1,43 @@ +out := $(obj)/../include/generated/asm + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +syscall32 := $(srctree)/$(src)/syscall_32.tbl +syscall64 := $(srctree)/$(src)/syscall_64.tbl + +syshdr := $(srctree)/$(src)/syscallhdr.sh +systbl := $(srctree)/$(src)/syscalltbl.sh + +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \ + $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget)) +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + +syshdr_abi_unistd_32 := i386 +$(out)/unistd_32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_32_ia32 := i386 +syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64 := 64 +$(out)/unistd_64.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +$(out)/syscalls_32.h: $(syscall32) $(systbl) + $(call if_changed,systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) + $(call if_changed,systbl) + +syshdr-y += unistd_32.h unistd_64.h +syshdr-y += syscalls_32.h +syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h +syshdr-$(CONFIG_X86_64) += syscalls_64.h 
+ +targets += $(syshdr-y) + +all: $(addprefix $(out)/,$(targets)) diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl new file mode 100644 index 000000000000..ce98e287c066 --- /dev/null +++ b/arch/x86/syscalls/syscall_32.tbl @@ -0,0 +1,357 @@ +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> <compat entry point> +# +# The abi is always "i386" for this file. +# +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit +2 i386 fork ptregs_fork stub32_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid sys32_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve ptregs_execve stub32_execve +12 i386 chdir sys_chdir +13 i386 time sys_time compat_sys_time +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek sys32_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount compat_sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime compat_sys_stime +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime compat_sys_utime +31 i386 stty +32 i386 gtty +33 i386 access sys_access +34 i386 nice sys_nice +35 i386 ftime +36 i386 sync sys_sync +37 i386 kill sys_kill sys32_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times +44 i386 prof +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 
+56 i386 mpx +57 i386 setpgid sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction sys_sigaction sys32_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend sys32_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap sys32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate +93 i386 ftruncate sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat compat_sys_newfstat 
+109 i386 olduname sys_uname +110 i386 iopl ptregs_iopl stub32_iopl +111 i386 vhangup sys_vhangup +112 i386 idle +113 i386 vm86old ptregs_vm86old sys32_vm86_warning +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc sys32_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn ptregs_sigreturn stub32_sigreturn +120 i386 clone ptregs_clone stub32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex compat_sys_adjtimex +125 i386 mprotect sys_mprotect sys32_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl sys32_quotactl +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_bdflush +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv compat_sys_readv +146 i386 writev sys_writev compat_sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_sysctl compat_sys_sysctl +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 
sched_rr_get_interval sys_sched_rr_get_interval sys32_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep compat_sys_nanosleep +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 ptregs_vm86 sys32_vm86_warning +167 i386 query_module +168 i386 poll sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn ptregs_rt_sigreturn stub32_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction sys32_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask sys32_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending sys32_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo sys32_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend +180 i386 pread64 sys_pread64 sys32_pread +181 i386 pwrite64 sys_pwrite64 sys32_pwrite +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack ptregs_sigaltstack stub32_sigaltstack +187 i386 sendfile sys_sendfile sys32_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork ptregs_vfork stub32_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 sys32_truncate64 +194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 +195 i386 stat64 sys_stat64 sys32_stat64 +196 i386 lstat64 sys_lstat64 sys32_lstat64 +197 i386 fstat64 sys_fstat64 sys32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid +200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 
setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid +225 i386 readahead sys_readahead sys32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents compat_sys_io_getevents +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie sys32_lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create 
sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl +289 i386 ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 sys32_fstatat +301 i386 
unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6 compat_sys_pselect6 +309 i386 ppoll sys_ppoll compat_sys_ppoll +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice compat_sys_vmsplice +317 i386 move_pages sys_move_pages compat_sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat compat_sys_utimensat +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_fallocate sys32_fallocate +325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark sys32_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +344 i386 
syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl new file mode 100644 index 000000000000..b440a8f7eefa --- /dev/null +++ b/arch/x86/syscalls/syscall_64.tbl @@ -0,0 +1,320 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> +# +# The abi is always "64" for this file (for now.) +# +0 64 read sys_read +1 64 write sys_write +2 64 open sys_open +3 64 close sys_close +4 64 stat sys_newstat +5 64 fstat sys_newfstat +6 64 lstat sys_newlstat +7 64 poll sys_poll +8 64 lseek sys_lseek +9 64 mmap sys_mmap +10 64 mprotect sys_mprotect +11 64 munmap sys_munmap +12 64 brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 64 rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn stub_rt_sigreturn +16 64 ioctl sys_ioctl +17 64 pread64 sys_pread64 +18 64 pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 64 access sys_access +22 64 pipe sys_pipe +23 64 select sys_select +24 64 sched_yield sys_sched_yield +25 64 mremap sys_mremap +26 64 msync sys_msync +27 64 mincore sys_mincore +28 64 madvise sys_madvise +29 64 shmget sys_shmget +30 64 shmat sys_shmat +31 64 shmctl sys_shmctl +32 64 dup sys_dup +33 64 dup2 sys_dup2 +34 64 pause sys_pause +35 64 nanosleep sys_nanosleep +36 64 getitimer sys_getitimer +37 64 alarm sys_alarm +38 64 setitimer sys_setitimer +39 64 getpid sys_getpid +40 64 sendfile sys_sendfile64 +41 64 socket sys_socket +42 64 connect sys_connect +43 64 accept sys_accept +44 64 sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 64 shutdown sys_shutdown +49 64 bind sys_bind +50 64 listen sys_listen +51 64 getsockname sys_getsockname +52 64 getpeername sys_getpeername +53 64 socketpair 
sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 64 clone stub_clone +57 64 fork stub_fork +58 64 vfork stub_vfork +59 64 execve stub_execve +60 64 exit sys_exit +61 64 wait4 sys_wait4 +62 64 kill sys_kill +63 64 uname sys_newuname +64 64 semget sys_semget +65 64 semop sys_semop +66 64 semctl sys_semctl +67 64 shmdt sys_shmdt +68 64 msgget sys_msgget +69 64 msgsnd sys_msgsnd +70 64 msgrcv sys_msgrcv +71 64 msgctl sys_msgctl +72 64 fcntl sys_fcntl +73 64 flock sys_flock +74 64 fsync sys_fsync +75 64 fdatasync sys_fdatasync +76 64 truncate sys_truncate +77 64 ftruncate sys_ftruncate +78 64 getdents sys_getdents +79 64 getcwd sys_getcwd +80 64 chdir sys_chdir +81 64 fchdir sys_fchdir +82 64 rename sys_rename +83 64 mkdir sys_mkdir +84 64 rmdir sys_rmdir +85 64 creat sys_creat +86 64 link sys_link +87 64 unlink sys_unlink +88 64 symlink sys_symlink +89 64 readlink sys_readlink +90 64 chmod sys_chmod +91 64 fchmod sys_fchmod +92 64 chown sys_chown +93 64 fchown sys_fchown +94 64 lchown sys_lchown +95 64 umask sys_umask +96 64 gettimeofday sys_gettimeofday +97 64 getrlimit sys_getrlimit +98 64 getrusage sys_getrusage +99 64 sysinfo sys_sysinfo +100 64 times sys_times +101 64 ptrace sys_ptrace +102 64 getuid sys_getuid +103 64 syslog sys_syslog +104 64 getgid sys_getgid +105 64 setuid sys_setuid +106 64 setgid sys_setgid +107 64 geteuid sys_geteuid +108 64 getegid sys_getegid +109 64 setpgid sys_setpgid +110 64 getppid sys_getppid +111 64 getpgrp sys_getpgrp +112 64 setsid sys_setsid +113 64 setreuid sys_setreuid +114 64 setregid sys_setregid +115 64 getgroups sys_getgroups +116 64 setgroups sys_setgroups +117 64 setresuid sys_setresuid +118 64 getresuid sys_getresuid +119 64 setresgid sys_setresgid +120 64 getresgid sys_getresgid +121 64 getpgid sys_getpgid +122 64 setfsuid sys_setfsuid +123 64 setfsgid sys_setfsgid +124 64 getsid sys_getsid +125 64 capget sys_capget +126 64 capset sys_capset +127 64 rt_sigpending sys_rt_sigpending 
+128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 64 rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack stub_sigaltstack +132 64 utime sys_utime +133 64 mknod sys_mknod +134 64 uselib +135 64 personality sys_personality +136 64 ustat sys_ustat +137 64 statfs sys_statfs +138 64 fstatfs sys_fstatfs +139 64 sysfs sys_sysfs +140 64 getpriority sys_getpriority +141 64 setpriority sys_setpriority +142 64 sched_setparam sys_sched_setparam +143 64 sched_getparam sys_sched_getparam +144 64 sched_setscheduler sys_sched_setscheduler +145 64 sched_getscheduler sys_sched_getscheduler +146 64 sched_get_priority_max sys_sched_get_priority_max +147 64 sched_get_priority_min sys_sched_get_priority_min +148 64 sched_rr_get_interval sys_sched_rr_get_interval +149 64 mlock sys_mlock +150 64 munlock sys_munlock +151 64 mlockall sys_mlockall +152 64 munlockall sys_munlockall +153 64 vhangup sys_vhangup +154 64 modify_ldt sys_modify_ldt +155 64 pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 64 prctl sys_prctl +158 64 arch_prctl sys_arch_prctl +159 64 adjtimex sys_adjtimex +160 64 setrlimit sys_setrlimit +161 64 chroot sys_chroot +162 64 sync sys_sync +163 64 acct sys_acct +164 64 settimeofday sys_settimeofday +165 64 mount sys_mount +166 64 umount2 sys_umount +167 64 swapon sys_swapon +168 64 swapoff sys_swapoff +169 64 reboot sys_reboot +170 64 sethostname sys_sethostname +171 64 setdomainname sys_setdomainname +172 64 iopl stub_iopl +173 64 ioperm sys_ioperm +174 64 create_module +175 64 init_module sys_init_module +176 64 delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 64 quotactl sys_quotactl +180 64 nfsservctl +181 64 getpmsg +182 64 putpmsg +183 64 afs_syscall +184 64 tuxcall +185 64 security +186 64 gettid sys_gettid +187 64 readahead sys_readahead +188 64 setxattr sys_setxattr +189 64 lsetxattr sys_lsetxattr +190 64 fsetxattr sys_fsetxattr +191 64 getxattr sys_getxattr +192 64 lgetxattr 
sys_lgetxattr +193 64 fgetxattr sys_fgetxattr +194 64 listxattr sys_listxattr +195 64 llistxattr sys_llistxattr +196 64 flistxattr sys_flistxattr +197 64 removexattr sys_removexattr +198 64 lremovexattr sys_lremovexattr +199 64 fremovexattr sys_fremovexattr +200 64 tkill sys_tkill +201 64 time sys_time +202 64 futex sys_futex +203 64 sched_setaffinity sys_sched_setaffinity +204 64 sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 64 io_destroy sys_io_destroy +208 64 io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 64 io_cancel sys_io_cancel +211 64 get_thread_area +212 64 lookup_dcookie sys_lookup_dcookie +213 64 epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 64 remap_file_pages sys_remap_file_pages +217 64 getdents64 sys_getdents64 +218 64 set_tid_address sys_set_tid_address +219 64 restart_syscall sys_restart_syscall +220 64 semtimedop sys_semtimedop +221 64 fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 64 timer_settime sys_timer_settime +224 64 timer_gettime sys_timer_gettime +225 64 timer_getoverrun sys_timer_getoverrun +226 64 timer_delete sys_timer_delete +227 64 clock_settime sys_clock_settime +228 64 clock_gettime sys_clock_gettime +229 64 clock_getres sys_clock_getres +230 64 clock_nanosleep sys_clock_nanosleep +231 64 exit_group sys_exit_group +232 64 epoll_wait sys_epoll_wait +233 64 epoll_ctl sys_epoll_ctl +234 64 tgkill sys_tgkill +235 64 utimes sys_utimes +236 64 vserver +237 64 mbind sys_mbind +238 64 set_mempolicy sys_set_mempolicy +239 64 get_mempolicy sys_get_mempolicy +240 64 mq_open sys_mq_open +241 64 mq_unlink sys_mq_unlink +242 64 mq_timedsend sys_mq_timedsend +243 64 mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 64 mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 64 add_key sys_add_key +249 64 request_key sys_request_key +250 64 keyctl sys_keyctl 
+251 64 ioprio_set sys_ioprio_set +252 64 ioprio_get sys_ioprio_get +253 64 inotify_init sys_inotify_init +254 64 inotify_add_watch sys_inotify_add_watch +255 64 inotify_rm_watch sys_inotify_rm_watch +256 64 migrate_pages sys_migrate_pages +257 64 openat sys_openat +258 64 mkdirat sys_mkdirat +259 64 mknodat sys_mknodat +260 64 fchownat sys_fchownat +261 64 futimesat sys_futimesat +262 64 newfstatat sys_newfstatat +263 64 unlinkat sys_unlinkat +264 64 renameat sys_renameat +265 64 linkat sys_linkat +266 64 symlinkat sys_symlinkat +267 64 readlinkat sys_readlinkat +268 64 fchmodat sys_fchmodat +269 64 faccessat sys_faccessat +270 64 pselect6 sys_pselect6 +271 64 ppoll sys_ppoll +272 64 unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 64 splice sys_splice +276 64 tee sys_tee +277 64 sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 64 utimensat sys_utimensat +281 64 epoll_pwait sys_epoll_pwait +282 64 signalfd sys_signalfd +283 64 timerfd_create sys_timerfd_create +284 64 eventfd sys_eventfd +285 64 fallocate sys_fallocate +286 64 timerfd_settime sys_timerfd_settime +287 64 timerfd_gettime sys_timerfd_gettime +288 64 accept4 sys_accept4 +289 64 signalfd4 sys_signalfd4 +290 64 eventfd2 sys_eventfd2 +291 64 epoll_create1 sys_epoll_create1 +292 64 dup3 sys_dup3 +293 64 pipe2 sys_pipe2 +294 64 inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 64 perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 64 fanotify_init sys_fanotify_init +301 64 fanotify_mark sys_fanotify_mark +302 64 prlimit64 sys_prlimit64 +303 64 name_to_handle_at sys_name_to_handle_at +304 64 open_by_handle_at sys_open_by_handle_at +305 64 clock_adjtime sys_clock_adjtime +306 64 syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 64 setns sys_setns +309 64 getcpu sys_getcpu +310 64 
process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh new file mode 100644 index 000000000000..0d473ff12eaf --- /dev/null +++ b/arch/x86/syscalls/syscallhdr.sh @@ -0,0 +1,36 @@ +#!/bin/sh + +in="$1" +out="$2" +my_abis=`echo "$3" | tr ',' ' '` +prefix="$4" +offset="$5" + +fileguard=_ASM_X86_`basename "$out" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` + +in_list () { + local x + for x in $1; do + if [ x"$x" = x"$2" ]; then + return 0 + fi + done + return 1 +} + +grep '^[0-9]' "$in" | sort -n | ( + echo "#ifndef ${fileguard}" + echo "#define ${fileguard} 1" + echo "" + + while read nr abi name entry ; do + if in_list "$my_abis" "$abi"; then + echo "#define __NR_${prefix}${name}" $((nr+offset)) + fi + done + + echo "" + echo "#endif /* ${fileguard} */" +) > "$out" diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh new file mode 100644 index 000000000000..0e7f8ec071e7 --- /dev/null +++ b/arch/x86/syscalls/syscalltbl.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +in="$1" +out="$2" + +grep '^[0-9]' "$in" | sort -n | ( + while read nr abi name entry compat; do + abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi + done +) > "$out" -- cgit v1.2.3 From 303395ac3bf3e2cb488435537d416bc840438fcb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 16:07:41 -0800 Subject: x86: Generate system call tables and unistd_*.h from tables Generate system call tables and unistd_*.h automatically from the tables in arch/x86/syscalls. All other information, like NR_syscalls, is auto-generated, some of which is in asm-offsets_*.c. 
This allows us to keep all the system call information in one place, and allows for kernel space and user space to see different information; this is currently used for the ia32 system call numbers when building the 64-bit kernel, but will be used by the x32 ABI in the near future. This also removes some gratuitous differences between i386, x86-64 and ia32; in particular, now all system call tables are generated with the same mechanism. Cc: H. J. Lu Cc: Sam Ravnborg Cc: Michal Marek Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 6 + arch/x86/ia32/Makefile | 2 +- arch/x86/ia32/ia32entry.S | 356 ------------------ arch/x86/ia32/syscall_ia32.c | 25 ++ arch/x86/include/asm/Kbuild | 5 +- arch/x86/include/asm/ia32_unistd.h | 13 +- arch/x86/include/asm/unistd.h | 54 ++- arch/x86/include/asm/unistd_32.h | 401 -------------------- arch/x86/include/asm/unistd_64.h | 732 ------------------------------------- arch/x86/kernel/Makefile | 3 +- arch/x86/kernel/asm-offsets_32.c | 8 + arch/x86/kernel/asm-offsets_64.c | 19 +- arch/x86/kernel/entry_32.S | 37 +- arch/x86/kernel/syscall_32.c | 25 ++ arch/x86/kernel/syscall_64.c | 14 +- arch/x86/kernel/syscall_table_32.S | 350 ------------------ 16 files changed, 154 insertions(+), 1896 deletions(-) create mode 100644 arch/x86/ia32/syscall_ia32.c delete mode 100644 arch/x86/include/asm/unistd_32.h delete mode 100644 arch/x86/include/asm/unistd_64.h create mode 100644 arch/x86/kernel/syscall_32.c delete mode 100644 arch/x86/kernel/syscall_table_32.S (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b02e509072a7..209ba1294592 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -117,6 +117,12 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +### +# Syscall table generation + +archheaders: + $(Q)$(MAKE) $(build)=arch/x86/syscalls all + ### # Kernel objects diff --git a/arch/x86/ia32/Makefile
b/arch/x86/ia32/Makefile index eea9a1c77d38..455646e0e532 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o -obj-$(CONFIG_IA32_EMULATION) += nosyscall.o +obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 59538a777695..72f853aea478 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -27,8 +27,6 @@ .section .entry.text, "ax" -#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) - .macro IA32_ARG_FIXUP noebp=0 movl %edi,%r8d .if \noebp @@ -496,357 +494,3 @@ ENTRY(ia32_ptregs_common) jmp ia32_sysret /* misbalances the return cache */ CFI_ENDPROC END(ia32_ptregs_common) - - .section .rodata,"a" - .align 8 -ia32_sys_call_table: - .quad sys_restart_syscall - .quad sys_exit - .quad stub32_fork - .quad sys_read - .quad sys_write - .quad compat_sys_open /* 5 */ - .quad sys_close - .quad sys32_waitpid - .quad sys_creat - .quad sys_link - .quad sys_unlink /* 10 */ - .quad stub32_execve - .quad sys_chdir - .quad compat_sys_time - .quad sys_mknod - .quad sys_chmod /* 15 */ - .quad sys_lchown16 - .quad quiet_ni_syscall /* old break syscall holder */ - .quad sys_stat - .quad sys32_lseek - .quad sys_getpid /* 20 */ - .quad compat_sys_mount /* mount */ - .quad sys_oldumount /* old_umount */ - .quad sys_setuid16 - .quad sys_getuid16 - .quad compat_sys_stime /* stime */ /* 25 */ - .quad compat_sys_ptrace /* ptrace */ - .quad sys_alarm - .quad sys_fstat /* (old)fstat */ - .quad sys_pause - .quad compat_sys_utime /* 30 */ - .quad quiet_ni_syscall /* old stty syscall holder */ - .quad quiet_ni_syscall /* old gtty syscall holder */ - .quad sys_access - .quad sys_nice - .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ - .quad sys_sync - .quad sys32_kill - .quad sys_rename - .quad 
sys_mkdir - .quad sys_rmdir /* 40 */ - .quad sys_dup - .quad sys_pipe - .quad compat_sys_times - .quad quiet_ni_syscall /* old prof syscall holder */ - .quad sys_brk /* 45 */ - .quad sys_setgid16 - .quad sys_getgid16 - .quad sys_signal - .quad sys_geteuid16 - .quad sys_getegid16 /* 50 */ - .quad sys_acct - .quad sys_umount /* new_umount */ - .quad quiet_ni_syscall /* old lock syscall holder */ - .quad compat_sys_ioctl - .quad compat_sys_fcntl64 /* 55 */ - .quad quiet_ni_syscall /* old mpx syscall holder */ - .quad sys_setpgid - .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys_olduname - .quad sys_umask /* 60 */ - .quad sys_chroot - .quad compat_sys_ustat - .quad sys_dup2 - .quad sys_getppid - .quad sys_getpgrp /* 65 */ - .quad sys_setsid - .quad sys32_sigaction - .quad sys_sgetmask - .quad sys_ssetmask - .quad sys_setreuid16 /* 70 */ - .quad sys_setregid16 - .quad sys32_sigsuspend - .quad compat_sys_sigpending - .quad sys_sethostname - .quad compat_sys_setrlimit /* 75 */ - .quad compat_sys_old_getrlimit /* old_getrlimit */ - .quad compat_sys_getrusage - .quad compat_sys_gettimeofday - .quad compat_sys_settimeofday - .quad sys_getgroups16 /* 80 */ - .quad sys_setgroups16 - .quad compat_sys_old_select - .quad sys_symlink - .quad sys_lstat - .quad sys_readlink /* 85 */ - .quad sys_uselib - .quad sys_swapon - .quad sys_reboot - .quad compat_sys_old_readdir - .quad sys32_mmap /* 90 */ - .quad sys_munmap - .quad sys_truncate - .quad sys_ftruncate - .quad sys_fchmod - .quad sys_fchown16 /* 95 */ - .quad sys_getpriority - .quad sys_setpriority - .quad quiet_ni_syscall /* old profil syscall holder */ - .quad compat_sys_statfs - .quad compat_sys_fstatfs /* 100 */ - .quad sys_ioperm - .quad compat_sys_socketcall - .quad sys_syslog - .quad compat_sys_setitimer - .quad compat_sys_getitimer /* 105 */ - .quad compat_sys_newstat - .quad compat_sys_newlstat - .quad compat_sys_newfstat - .quad sys_uname - .quad stub32_iopl /* 110 */ - .quad sys_vhangup - .quad 
quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ - .quad compat_sys_wait4 - .quad sys_swapoff /* 115 */ - .quad compat_sys_sysinfo - .quad sys32_ipc - .quad sys_fsync - .quad stub32_sigreturn - .quad stub32_clone /* 120 */ - .quad sys_setdomainname - .quad sys_newuname - .quad sys_modify_ldt - .quad compat_sys_adjtimex - .quad sys32_mprotect /* 125 */ - .quad compat_sys_sigprocmask - .quad quiet_ni_syscall /* create_module */ - .quad sys_init_module - .quad sys_delete_module - .quad quiet_ni_syscall /* 130 get_kernel_syms */ - .quad sys32_quotactl - .quad sys_getpgid - .quad sys_fchdir - .quad quiet_ni_syscall /* bdflush */ - .quad sys_sysfs /* 135 */ - .quad sys_personality - .quad quiet_ni_syscall /* for afs_syscall */ - .quad sys_setfsuid16 - .quad sys_setfsgid16 - .quad sys_llseek /* 140 */ - .quad compat_sys_getdents - .quad compat_sys_select - .quad sys_flock - .quad sys_msync - .quad compat_sys_readv /* 145 */ - .quad compat_sys_writev - .quad sys_getsid - .quad sys_fdatasync - .quad compat_sys_sysctl /* sysctl */ - .quad sys_mlock /* 150 */ - .quad sys_munlock - .quad sys_mlockall - .quad sys_munlockall - .quad sys_sched_setparam - .quad sys_sched_getparam /* 155 */ - .quad sys_sched_setscheduler - .quad sys_sched_getscheduler - .quad sys_sched_yield - .quad sys_sched_get_priority_max - .quad sys_sched_get_priority_min /* 160 */ - .quad sys32_sched_rr_get_interval - .quad compat_sys_nanosleep - .quad sys_mremap - .quad sys_setresuid16 - .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ - .quad quiet_ni_syscall /* query_module */ - .quad sys_poll - .quad quiet_ni_syscall /* old nfsservctl */ - .quad sys_setresgid16 /* 170 */ - .quad sys_getresgid16 - .quad sys_prctl - .quad stub32_rt_sigreturn - .quad sys32_rt_sigaction - .quad sys32_rt_sigprocmask /* 175 */ - .quad sys32_rt_sigpending - .quad compat_sys_rt_sigtimedwait - .quad sys32_rt_sigqueueinfo - .quad sys_rt_sigsuspend - .quad sys32_pread /* 180 
*/ - .quad sys32_pwrite - .quad sys_chown16 - .quad sys_getcwd - .quad sys_capget - .quad sys_capset - .quad stub32_sigaltstack - .quad sys32_sendfile - .quad quiet_ni_syscall /* streams1 */ - .quad quiet_ni_syscall /* streams2 */ - .quad stub32_vfork /* 190 */ - .quad compat_sys_getrlimit - .quad sys_mmap_pgoff - .quad sys32_truncate64 - .quad sys32_ftruncate64 - .quad sys32_stat64 /* 195 */ - .quad sys32_lstat64 - .quad sys32_fstat64 - .quad sys_lchown - .quad sys_getuid - .quad sys_getgid /* 200 */ - .quad sys_geteuid - .quad sys_getegid - .quad sys_setreuid - .quad sys_setregid - .quad sys_getgroups /* 205 */ - .quad sys_setgroups - .quad sys_fchown - .quad sys_setresuid - .quad sys_getresuid - .quad sys_setresgid /* 210 */ - .quad sys_getresgid - .quad sys_chown - .quad sys_setuid - .quad sys_setgid - .quad sys_setfsuid /* 215 */ - .quad sys_setfsgid - .quad sys_pivot_root - .quad sys_mincore - .quad sys_madvise - .quad compat_sys_getdents64 /* 220 getdents64 */ - .quad compat_sys_fcntl64 - .quad quiet_ni_syscall /* tux */ - .quad quiet_ni_syscall /* security */ - .quad sys_gettid - .quad sys32_readahead /* 225 */ - .quad sys_setxattr - .quad sys_lsetxattr - .quad sys_fsetxattr - .quad sys_getxattr - .quad sys_lgetxattr /* 230 */ - .quad sys_fgetxattr - .quad sys_listxattr - .quad sys_llistxattr - .quad sys_flistxattr - .quad sys_removexattr /* 235 */ - .quad sys_lremovexattr - .quad sys_fremovexattr - .quad sys_tkill - .quad sys_sendfile64 - .quad compat_sys_futex /* 240 */ - .quad compat_sys_sched_setaffinity - .quad compat_sys_sched_getaffinity - .quad sys_set_thread_area - .quad sys_get_thread_area - .quad compat_sys_io_setup /* 245 */ - .quad sys_io_destroy - .quad compat_sys_io_getevents - .quad compat_sys_io_submit - .quad sys_io_cancel - .quad sys32_fadvise64 /* 250 */ - .quad quiet_ni_syscall /* free_huge_pages */ - .quad sys_exit_group - .quad sys32_lookup_dcookie - .quad sys_epoll_create - .quad sys_epoll_ctl /* 255 */ - .quad sys_epoll_wait - .quad 
sys_remap_file_pages - .quad sys_set_tid_address - .quad compat_sys_timer_create - .quad compat_sys_timer_settime /* 260 */ - .quad compat_sys_timer_gettime - .quad sys_timer_getoverrun - .quad sys_timer_delete - .quad compat_sys_clock_settime - .quad compat_sys_clock_gettime /* 265 */ - .quad compat_sys_clock_getres - .quad compat_sys_clock_nanosleep - .quad compat_sys_statfs64 - .quad compat_sys_fstatfs64 - .quad sys_tgkill /* 270 */ - .quad compat_sys_utimes - .quad sys32_fadvise64_64 - .quad quiet_ni_syscall /* sys_vserver */ - .quad sys_mbind - .quad compat_sys_get_mempolicy /* 275 */ - .quad sys_set_mempolicy - .quad compat_sys_mq_open - .quad sys_mq_unlink - .quad compat_sys_mq_timedsend - .quad compat_sys_mq_timedreceive /* 280 */ - .quad compat_sys_mq_notify - .quad compat_sys_mq_getsetattr - .quad compat_sys_kexec_load /* reserved for kexec */ - .quad compat_sys_waitid - .quad quiet_ni_syscall /* 285: sys_altroot */ - .quad sys_add_key - .quad sys_request_key - .quad sys_keyctl - .quad sys_ioprio_set - .quad sys_ioprio_get /* 290 */ - .quad sys_inotify_init - .quad sys_inotify_add_watch - .quad sys_inotify_rm_watch - .quad sys_migrate_pages - .quad compat_sys_openat /* 295 */ - .quad sys_mkdirat - .quad sys_mknodat - .quad sys_fchownat - .quad compat_sys_futimesat - .quad sys32_fstatat /* 300 */ - .quad sys_unlinkat - .quad sys_renameat - .quad sys_linkat - .quad sys_symlinkat - .quad sys_readlinkat /* 305 */ - .quad sys_fchmodat - .quad sys_faccessat - .quad compat_sys_pselect6 - .quad compat_sys_ppoll - .quad sys_unshare /* 310 */ - .quad compat_sys_set_robust_list - .quad compat_sys_get_robust_list - .quad sys_splice - .quad sys32_sync_file_range - .quad sys_tee /* 315 */ - .quad compat_sys_vmsplice - .quad compat_sys_move_pages - .quad sys_getcpu - .quad sys_epoll_pwait - .quad compat_sys_utimensat /* 320 */ - .quad compat_sys_signalfd - .quad sys_timerfd_create - .quad sys_eventfd - .quad sys32_fallocate - .quad compat_sys_timerfd_settime /* 325 */ - 
.quad compat_sys_timerfd_gettime - .quad compat_sys_signalfd4 - .quad sys_eventfd2 - .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ - .quad sys_pipe2 - .quad sys_inotify_init1 - .quad compat_sys_preadv - .quad compat_sys_pwritev - .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_event_open - .quad compat_sys_recvmmsg - .quad sys_fanotify_init - .quad sys32_fanotify_mark - .quad sys_prlimit64 /* 340 */ - .quad sys_name_to_handle_at - .quad compat_sys_open_by_handle_at - .quad compat_sys_clock_adjtime - .quad sys_syncfs - .quad compat_sys_sendmmsg /* 345 */ - .quad sys_setns - .quad compat_sys_process_vm_readv - .quad compat_sys_process_vm_writev -ia32_syscall_end: diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c new file mode 100644 index 000000000000..d04d3dbc47d4 --- /dev/null +++ b/arch/x86/ia32/syscall_ia32.c @@ -0,0 +1,25 @@ +/* System call table for ia32 emulation. */ + +#include +#include +#include +#include + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = compat, + +typedef void (*sys_call_ptr_t)(void); + +extern void compat_ni_syscall(void); + +const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... 
__NR_ia32_syscall_max] = &compat_ni_syscall, +#include +}; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 6fa90a845e4c..b57e6a43a37a 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -19,7 +19,8 @@ header-y += processor-flags.h header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h -header-y += unistd_32.h -header-y += unistd_64.h header-y += vm86.h header-y += vsyscall.h + +genhdr-y += unistd_32.h +genhdr-y += unistd_64.h diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h index 976f6ecd2ce6..b0d5716ca1e4 100644 --- a/arch/x86/include/asm/ia32_unistd.h +++ b/arch/x86/include/asm/ia32_unistd.h @@ -2,17 +2,10 @@ #define _ASM_X86_IA32_UNISTD_H /* - * This file contains the system call numbers of the ia32 port, + * This file contains the system call numbers of the ia32 compat ABI, * this is for the kernel only. - * Only add syscalls here where some part of the kernel needs to know - * the number. This should be otherwise in sync with asm-x86/unistd_32.h. 
-AK */ - -#define __NR_ia32_restart_syscall 0 -#define __NR_ia32_exit 1 -#define __NR_ia32_read 3 -#define __NR_ia32_write 4 -#define __NR_ia32_sigreturn 119 -#define __NR_ia32_rt_sigreturn 173 +#define __SYSCALL_ia32_NR(x) (x) +#include #endif /* _ASM_X86_IA32_UNISTD_H */ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 2a58ed3e51d8..b4a3db7ce140 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -1,13 +1,59 @@ +#ifndef _ASM_X86_UNISTD_H +#define _ASM_X86_UNISTD_H 1 + #ifdef __KERNEL__ # ifdef CONFIG_X86_32 -# include "unistd_32.h" + +# include +# define __ARCH_WANT_IPC_PARSE_VERSION +# define __ARCH_WANT_STAT64 +# define __ARCH_WANT_SYS_OLD_MMAP +# define __ARCH_WANT_SYS_OLD_SELECT + # else -# include "unistd_64.h" + +# include +# define __ARCH_WANT_COMPAT_SYS_TIME + # endif + +# define __ARCH_WANT_OLD_READDIR +# define __ARCH_WANT_OLD_STAT +# define __ARCH_WANT_SYS_ALARM +# define __ARCH_WANT_SYS_FADVISE64 +# define __ARCH_WANT_SYS_GETHOSTNAME +# define __ARCH_WANT_SYS_GETPGRP +# define __ARCH_WANT_SYS_LLSEEK +# define __ARCH_WANT_SYS_NICE +# define __ARCH_WANT_SYS_OLDUMOUNT +# define __ARCH_WANT_SYS_OLD_GETRLIMIT +# define __ARCH_WANT_SYS_OLD_UNAME +# define __ARCH_WANT_SYS_PAUSE +# define __ARCH_WANT_SYS_RT_SIGACTION +# define __ARCH_WANT_SYS_RT_SIGSUSPEND +# define __ARCH_WANT_SYS_SGETMASK +# define __ARCH_WANT_SYS_SIGNAL +# define __ARCH_WANT_SYS_SIGPENDING +# define __ARCH_WANT_SYS_SIGPROCMASK +# define __ARCH_WANT_SYS_SOCKETCALL +# define __ARCH_WANT_SYS_TIME +# define __ARCH_WANT_SYS_UTIME +# define __ARCH_WANT_SYS_WAITPID + +/* + * "Conditional" syscalls + * + * What we want is __attribute__((weak,alias("sys_ni_syscall"))), + * but it doesn't work on all toolchains, so we just do it by hand + */ +# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") + #else # ifdef __i386__ -# include "unistd_32.h" +# include # else -# include "unistd_64.h" +# include # endif #endif + 
+#endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h deleted file mode 100644 index 599c77d38f33..000000000000 --- a/arch/x86/include/asm/unistd_32.h +++ /dev/null @@ -1,401 +0,0 @@ -#ifndef _ASM_X86_UNISTD_32_H -#define _ASM_X86_UNISTD_32_H - -/* - * This file contains the system call numbers. - */ - -#define __NR_restart_syscall 0 -#define __NR_exit 1 -#define __NR_fork 2 -#define __NR_read 3 -#define __NR_write 4 -#define __NR_open 5 -#define __NR_close 6 -#define __NR_waitpid 7 -#define __NR_creat 8 -#define __NR_link 9 -#define __NR_unlink 10 -#define __NR_execve 11 -#define __NR_chdir 12 -#define __NR_time 13 -#define __NR_mknod 14 -#define __NR_chmod 15 -#define __NR_lchown 16 -#define __NR_break 17 -#define __NR_oldstat 18 -#define __NR_lseek 19 -#define __NR_getpid 20 -#define __NR_mount 21 -#define __NR_umount 22 -#define __NR_setuid 23 -#define __NR_getuid 24 -#define __NR_stime 25 -#define __NR_ptrace 26 -#define __NR_alarm 27 -#define __NR_oldfstat 28 -#define __NR_pause 29 -#define __NR_utime 30 -#define __NR_stty 31 -#define __NR_gtty 32 -#define __NR_access 33 -#define __NR_nice 34 -#define __NR_ftime 35 -#define __NR_sync 36 -#define __NR_kill 37 -#define __NR_rename 38 -#define __NR_mkdir 39 -#define __NR_rmdir 40 -#define __NR_dup 41 -#define __NR_pipe 42 -#define __NR_times 43 -#define __NR_prof 44 -#define __NR_brk 45 -#define __NR_setgid 46 -#define __NR_getgid 47 -#define __NR_signal 48 -#define __NR_geteuid 49 -#define __NR_getegid 50 -#define __NR_acct 51 -#define __NR_umount2 52 -#define __NR_lock 53 -#define __NR_ioctl 54 -#define __NR_fcntl 55 -#define __NR_mpx 56 -#define __NR_setpgid 57 -#define __NR_ulimit 58 -#define __NR_oldolduname 59 -#define __NR_umask 60 -#define __NR_chroot 61 -#define __NR_ustat 62 -#define __NR_dup2 63 -#define __NR_getppid 64 -#define __NR_getpgrp 65 -#define __NR_setsid 66 -#define __NR_sigaction 67 -#define __NR_sgetmask 68 -#define __NR_ssetmask 
69 -#define __NR_setreuid 70 -#define __NR_setregid 71 -#define __NR_sigsuspend 72 -#define __NR_sigpending 73 -#define __NR_sethostname 74 -#define __NR_setrlimit 75 -#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */ -#define __NR_getrusage 77 -#define __NR_gettimeofday 78 -#define __NR_settimeofday 79 -#define __NR_getgroups 80 -#define __NR_setgroups 81 -#define __NR_select 82 -#define __NR_symlink 83 -#define __NR_oldlstat 84 -#define __NR_readlink 85 -#define __NR_uselib 86 -#define __NR_swapon 87 -#define __NR_reboot 88 -#define __NR_readdir 89 -#define __NR_mmap 90 -#define __NR_munmap 91 -#define __NR_truncate 92 -#define __NR_ftruncate 93 -#define __NR_fchmod 94 -#define __NR_fchown 95 -#define __NR_getpriority 96 -#define __NR_setpriority 97 -#define __NR_profil 98 -#define __NR_statfs 99 -#define __NR_fstatfs 100 -#define __NR_ioperm 101 -#define __NR_socketcall 102 -#define __NR_syslog 103 -#define __NR_setitimer 104 -#define __NR_getitimer 105 -#define __NR_stat 106 -#define __NR_lstat 107 -#define __NR_fstat 108 -#define __NR_olduname 109 -#define __NR_iopl 110 -#define __NR_vhangup 111 -#define __NR_idle 112 -#define __NR_vm86old 113 -#define __NR_wait4 114 -#define __NR_swapoff 115 -#define __NR_sysinfo 116 -#define __NR_ipc 117 -#define __NR_fsync 118 -#define __NR_sigreturn 119 -#define __NR_clone 120 -#define __NR_setdomainname 121 -#define __NR_uname 122 -#define __NR_modify_ldt 123 -#define __NR_adjtimex 124 -#define __NR_mprotect 125 -#define __NR_sigprocmask 126 -#define __NR_create_module 127 -#define __NR_init_module 128 -#define __NR_delete_module 129 -#define __NR_get_kernel_syms 130 -#define __NR_quotactl 131 -#define __NR_getpgid 132 -#define __NR_fchdir 133 -#define __NR_bdflush 134 -#define __NR_sysfs 135 -#define __NR_personality 136 -#define __NR_afs_syscall 137 /* Syscall for Andrew File System */ -#define __NR_setfsuid 138 -#define __NR_setfsgid 139 -#define __NR__llseek 140 -#define __NR_getdents 141 -#define 
__NR__newselect 142 -#define __NR_flock 143 -#define __NR_msync 144 -#define __NR_readv 145 -#define __NR_writev 146 -#define __NR_getsid 147 -#define __NR_fdatasync 148 -#define __NR__sysctl 149 -#define __NR_mlock 150 -#define __NR_munlock 151 -#define __NR_mlockall 152 -#define __NR_munlockall 153 -#define __NR_sched_setparam 154 -#define __NR_sched_getparam 155 -#define __NR_sched_setscheduler 156 -#define __NR_sched_getscheduler 157 -#define __NR_sched_yield 158 -#define __NR_sched_get_priority_max 159 -#define __NR_sched_get_priority_min 160 -#define __NR_sched_rr_get_interval 161 -#define __NR_nanosleep 162 -#define __NR_mremap 163 -#define __NR_setresuid 164 -#define __NR_getresuid 165 -#define __NR_vm86 166 -#define __NR_query_module 167 -#define __NR_poll 168 -#define __NR_nfsservctl 169 -#define __NR_setresgid 170 -#define __NR_getresgid 171 -#define __NR_prctl 172 -#define __NR_rt_sigreturn 173 -#define __NR_rt_sigaction 174 -#define __NR_rt_sigprocmask 175 -#define __NR_rt_sigpending 176 -#define __NR_rt_sigtimedwait 177 -#define __NR_rt_sigqueueinfo 178 -#define __NR_rt_sigsuspend 179 -#define __NR_pread64 180 -#define __NR_pwrite64 181 -#define __NR_chown 182 -#define __NR_getcwd 183 -#define __NR_capget 184 -#define __NR_capset 185 -#define __NR_sigaltstack 186 -#define __NR_sendfile 187 -#define __NR_getpmsg 188 /* some people actually want streams */ -#define __NR_putpmsg 189 /* some people actually want streams */ -#define __NR_vfork 190 -#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */ -#define __NR_mmap2 192 -#define __NR_truncate64 193 -#define __NR_ftruncate64 194 -#define __NR_stat64 195 -#define __NR_lstat64 196 -#define __NR_fstat64 197 -#define __NR_lchown32 198 -#define __NR_getuid32 199 -#define __NR_getgid32 200 -#define __NR_geteuid32 201 -#define __NR_getegid32 202 -#define __NR_setreuid32 203 -#define __NR_setregid32 204 -#define __NR_getgroups32 205 -#define __NR_setgroups32 206 -#define __NR_fchown32 207 -#define 
__NR_setresuid32 208 -#define __NR_getresuid32 209 -#define __NR_setresgid32 210 -#define __NR_getresgid32 211 -#define __NR_chown32 212 -#define __NR_setuid32 213 -#define __NR_setgid32 214 -#define __NR_setfsuid32 215 -#define __NR_setfsgid32 216 -#define __NR_pivot_root 217 -#define __NR_mincore 218 -#define __NR_madvise 219 -#define __NR_madvise1 219 /* delete when C lib stub is removed */ -#define __NR_getdents64 220 -#define __NR_fcntl64 221 -/* 223 is unused */ -#define __NR_gettid 224 -#define __NR_readahead 225 -#define __NR_setxattr 226 -#define __NR_lsetxattr 227 -#define __NR_fsetxattr 228 -#define __NR_getxattr 229 -#define __NR_lgetxattr 230 -#define __NR_fgetxattr 231 -#define __NR_listxattr 232 -#define __NR_llistxattr 233 -#define __NR_flistxattr 234 -#define __NR_removexattr 235 -#define __NR_lremovexattr 236 -#define __NR_fremovexattr 237 -#define __NR_tkill 238 -#define __NR_sendfile64 239 -#define __NR_futex 240 -#define __NR_sched_setaffinity 241 -#define __NR_sched_getaffinity 242 -#define __NR_set_thread_area 243 -#define __NR_get_thread_area 244 -#define __NR_io_setup 245 -#define __NR_io_destroy 246 -#define __NR_io_getevents 247 -#define __NR_io_submit 248 -#define __NR_io_cancel 249 -#define __NR_fadvise64 250 -/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */ -#define __NR_exit_group 252 -#define __NR_lookup_dcookie 253 -#define __NR_epoll_create 254 -#define __NR_epoll_ctl 255 -#define __NR_epoll_wait 256 -#define __NR_remap_file_pages 257 -#define __NR_set_tid_address 258 -#define __NR_timer_create 259 -#define __NR_timer_settime (__NR_timer_create+1) -#define __NR_timer_gettime (__NR_timer_create+2) -#define __NR_timer_getoverrun (__NR_timer_create+3) -#define __NR_timer_delete (__NR_timer_create+4) -#define __NR_clock_settime (__NR_timer_create+5) -#define __NR_clock_gettime (__NR_timer_create+6) -#define __NR_clock_getres (__NR_timer_create+7) -#define __NR_clock_nanosleep (__NR_timer_create+8) -#define 
__NR_statfs64 268 -#define __NR_fstatfs64 269 -#define __NR_tgkill 270 -#define __NR_utimes 271 -#define __NR_fadvise64_64 272 -#define __NR_vserver 273 -#define __NR_mbind 274 -#define __NR_get_mempolicy 275 -#define __NR_set_mempolicy 276 -#define __NR_mq_open 277 -#define __NR_mq_unlink (__NR_mq_open+1) -#define __NR_mq_timedsend (__NR_mq_open+2) -#define __NR_mq_timedreceive (__NR_mq_open+3) -#define __NR_mq_notify (__NR_mq_open+4) -#define __NR_mq_getsetattr (__NR_mq_open+5) -#define __NR_kexec_load 283 -#define __NR_waitid 284 -/* #define __NR_sys_setaltroot 285 */ -#define __NR_add_key 286 -#define __NR_request_key 287 -#define __NR_keyctl 288 -#define __NR_ioprio_set 289 -#define __NR_ioprio_get 290 -#define __NR_inotify_init 291 -#define __NR_inotify_add_watch 292 -#define __NR_inotify_rm_watch 293 -#define __NR_migrate_pages 294 -#define __NR_openat 295 -#define __NR_mkdirat 296 -#define __NR_mknodat 297 -#define __NR_fchownat 298 -#define __NR_futimesat 299 -#define __NR_fstatat64 300 -#define __NR_unlinkat 301 -#define __NR_renameat 302 -#define __NR_linkat 303 -#define __NR_symlinkat 304 -#define __NR_readlinkat 305 -#define __NR_fchmodat 306 -#define __NR_faccessat 307 -#define __NR_pselect6 308 -#define __NR_ppoll 309 -#define __NR_unshare 310 -#define __NR_set_robust_list 311 -#define __NR_get_robust_list 312 -#define __NR_splice 313 -#define __NR_sync_file_range 314 -#define __NR_tee 315 -#define __NR_vmsplice 316 -#define __NR_move_pages 317 -#define __NR_getcpu 318 -#define __NR_epoll_pwait 319 -#define __NR_utimensat 320 -#define __NR_signalfd 321 -#define __NR_timerfd_create 322 -#define __NR_eventfd 323 -#define __NR_fallocate 324 -#define __NR_timerfd_settime 325 -#define __NR_timerfd_gettime 326 -#define __NR_signalfd4 327 -#define __NR_eventfd2 328 -#define __NR_epoll_create1 329 -#define __NR_dup3 330 -#define __NR_pipe2 331 -#define __NR_inotify_init1 332 -#define __NR_preadv 333 -#define __NR_pwritev 334 -#define __NR_rt_tgsigqueueinfo 
335 -#define __NR_perf_event_open 336 -#define __NR_recvmmsg 337 -#define __NR_fanotify_init 338 -#define __NR_fanotify_mark 339 -#define __NR_prlimit64 340 -#define __NR_name_to_handle_at 341 -#define __NR_open_by_handle_at 342 -#define __NR_clock_adjtime 343 -#define __NR_syncfs 344 -#define __NR_sendmmsg 345 -#define __NR_setns 346 -#define __NR_process_vm_readv 347 -#define __NR_process_vm_writev 348 - -#ifdef __KERNEL__ - -#define NR_syscalls 349 - -#define __ARCH_WANT_IPC_PARSE_VERSION -#define __ARCH_WANT_OLD_READDIR -#define __ARCH_WANT_OLD_STAT -#define __ARCH_WANT_STAT64 -#define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_IPC -#define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK -#define __ARCH_WANT_SYS_SIGNAL -#define __ARCH_WANT_SYS_TIME -#define __ARCH_WANT_SYS_UTIME -#define __ARCH_WANT_SYS_WAITPID -#define __ARCH_WANT_SYS_SOCKETCALL -#define __ARCH_WANT_SYS_FADVISE64 -#define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK -#define __ARCH_WANT_SYS_NICE -#define __ARCH_WANT_SYS_OLD_GETRLIMIT -#define __ARCH_WANT_SYS_OLD_UNAME -#define __ARCH_WANT_SYS_OLD_MMAP -#define __ARCH_WANT_SYS_OLD_SELECT -#define __ARCH_WANT_SYS_OLDUMOUNT -#define __ARCH_WANT_SYS_SIGPENDING -#define __ARCH_WANT_SYS_SIGPROCMASK -#define __ARCH_WANT_SYS_RT_SIGACTION -#define __ARCH_WANT_SYS_RT_SIGSUSPEND - -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#ifndef cond_syscall -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif - -#endif /* __KERNEL__ */ -#endif /* _ASM_X86_UNISTD_32_H */ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h deleted file mode 100644 index 0431f193c3f2..000000000000 --- a/arch/x86/include/asm/unistd_64.h +++ /dev/null @@ -1,732 +0,0 @@ -#ifndef _ASM_X86_UNISTD_64_H -#define _ASM_X86_UNISTD_64_H - -#ifndef 
__SYSCALL -#define __SYSCALL(a, b) -#endif - -/* - * This file contains the system call numbers. - * - * Note: holes are not allowed. - */ - -/* at least 8 syscall per cacheline */ -#define __NR_read 0 -__SYSCALL(__NR_read, sys_read) -#define __NR_write 1 -__SYSCALL(__NR_write, sys_write) -#define __NR_open 2 -__SYSCALL(__NR_open, sys_open) -#define __NR_close 3 -__SYSCALL(__NR_close, sys_close) -#define __NR_stat 4 -__SYSCALL(__NR_stat, sys_newstat) -#define __NR_fstat 5 -__SYSCALL(__NR_fstat, sys_newfstat) -#define __NR_lstat 6 -__SYSCALL(__NR_lstat, sys_newlstat) -#define __NR_poll 7 -__SYSCALL(__NR_poll, sys_poll) - -#define __NR_lseek 8 -__SYSCALL(__NR_lseek, sys_lseek) -#define __NR_mmap 9 -__SYSCALL(__NR_mmap, sys_mmap) -#define __NR_mprotect 10 -__SYSCALL(__NR_mprotect, sys_mprotect) -#define __NR_munmap 11 -__SYSCALL(__NR_munmap, sys_munmap) -#define __NR_brk 12 -__SYSCALL(__NR_brk, sys_brk) -#define __NR_rt_sigaction 13 -__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction) -#define __NR_rt_sigprocmask 14 -__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask) -#define __NR_rt_sigreturn 15 -__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn) - -#define __NR_ioctl 16 -__SYSCALL(__NR_ioctl, sys_ioctl) -#define __NR_pread64 17 -__SYSCALL(__NR_pread64, sys_pread64) -#define __NR_pwrite64 18 -__SYSCALL(__NR_pwrite64, sys_pwrite64) -#define __NR_readv 19 -__SYSCALL(__NR_readv, sys_readv) -#define __NR_writev 20 -__SYSCALL(__NR_writev, sys_writev) -#define __NR_access 21 -__SYSCALL(__NR_access, sys_access) -#define __NR_pipe 22 -__SYSCALL(__NR_pipe, sys_pipe) -#define __NR_select 23 -__SYSCALL(__NR_select, sys_select) - -#define __NR_sched_yield 24 -__SYSCALL(__NR_sched_yield, sys_sched_yield) -#define __NR_mremap 25 -__SYSCALL(__NR_mremap, sys_mremap) -#define __NR_msync 26 -__SYSCALL(__NR_msync, sys_msync) -#define __NR_mincore 27 -__SYSCALL(__NR_mincore, sys_mincore) -#define __NR_madvise 28 -__SYSCALL(__NR_madvise, sys_madvise) -#define __NR_shmget 29 
-__SYSCALL(__NR_shmget, sys_shmget) -#define __NR_shmat 30 -__SYSCALL(__NR_shmat, sys_shmat) -#define __NR_shmctl 31 -__SYSCALL(__NR_shmctl, sys_shmctl) - -#define __NR_dup 32 -__SYSCALL(__NR_dup, sys_dup) -#define __NR_dup2 33 -__SYSCALL(__NR_dup2, sys_dup2) -#define __NR_pause 34 -__SYSCALL(__NR_pause, sys_pause) -#define __NR_nanosleep 35 -__SYSCALL(__NR_nanosleep, sys_nanosleep) -#define __NR_getitimer 36 -__SYSCALL(__NR_getitimer, sys_getitimer) -#define __NR_alarm 37 -__SYSCALL(__NR_alarm, sys_alarm) -#define __NR_setitimer 38 -__SYSCALL(__NR_setitimer, sys_setitimer) -#define __NR_getpid 39 -__SYSCALL(__NR_getpid, sys_getpid) - -#define __NR_sendfile 40 -__SYSCALL(__NR_sendfile, sys_sendfile64) -#define __NR_socket 41 -__SYSCALL(__NR_socket, sys_socket) -#define __NR_connect 42 -__SYSCALL(__NR_connect, sys_connect) -#define __NR_accept 43 -__SYSCALL(__NR_accept, sys_accept) -#define __NR_sendto 44 -__SYSCALL(__NR_sendto, sys_sendto) -#define __NR_recvfrom 45 -__SYSCALL(__NR_recvfrom, sys_recvfrom) -#define __NR_sendmsg 46 -__SYSCALL(__NR_sendmsg, sys_sendmsg) -#define __NR_recvmsg 47 -__SYSCALL(__NR_recvmsg, sys_recvmsg) - -#define __NR_shutdown 48 -__SYSCALL(__NR_shutdown, sys_shutdown) -#define __NR_bind 49 -__SYSCALL(__NR_bind, sys_bind) -#define __NR_listen 50 -__SYSCALL(__NR_listen, sys_listen) -#define __NR_getsockname 51 -__SYSCALL(__NR_getsockname, sys_getsockname) -#define __NR_getpeername 52 -__SYSCALL(__NR_getpeername, sys_getpeername) -#define __NR_socketpair 53 -__SYSCALL(__NR_socketpair, sys_socketpair) -#define __NR_setsockopt 54 -__SYSCALL(__NR_setsockopt, sys_setsockopt) -#define __NR_getsockopt 55 -__SYSCALL(__NR_getsockopt, sys_getsockopt) - -#define __NR_clone 56 -__SYSCALL(__NR_clone, stub_clone) -#define __NR_fork 57 -__SYSCALL(__NR_fork, stub_fork) -#define __NR_vfork 58 -__SYSCALL(__NR_vfork, stub_vfork) -#define __NR_execve 59 -__SYSCALL(__NR_execve, stub_execve) -#define __NR_exit 60 -__SYSCALL(__NR_exit, sys_exit) -#define 
__NR_wait4 61 -__SYSCALL(__NR_wait4, sys_wait4) -#define __NR_kill 62 -__SYSCALL(__NR_kill, sys_kill) -#define __NR_uname 63 -__SYSCALL(__NR_uname, sys_newuname) - -#define __NR_semget 64 -__SYSCALL(__NR_semget, sys_semget) -#define __NR_semop 65 -__SYSCALL(__NR_semop, sys_semop) -#define __NR_semctl 66 -__SYSCALL(__NR_semctl, sys_semctl) -#define __NR_shmdt 67 -__SYSCALL(__NR_shmdt, sys_shmdt) -#define __NR_msgget 68 -__SYSCALL(__NR_msgget, sys_msgget) -#define __NR_msgsnd 69 -__SYSCALL(__NR_msgsnd, sys_msgsnd) -#define __NR_msgrcv 70 -__SYSCALL(__NR_msgrcv, sys_msgrcv) -#define __NR_msgctl 71 -__SYSCALL(__NR_msgctl, sys_msgctl) - -#define __NR_fcntl 72 -__SYSCALL(__NR_fcntl, sys_fcntl) -#define __NR_flock 73 -__SYSCALL(__NR_flock, sys_flock) -#define __NR_fsync 74 -__SYSCALL(__NR_fsync, sys_fsync) -#define __NR_fdatasync 75 -__SYSCALL(__NR_fdatasync, sys_fdatasync) -#define __NR_truncate 76 -__SYSCALL(__NR_truncate, sys_truncate) -#define __NR_ftruncate 77 -__SYSCALL(__NR_ftruncate, sys_ftruncate) -#define __NR_getdents 78 -__SYSCALL(__NR_getdents, sys_getdents) -#define __NR_getcwd 79 -__SYSCALL(__NR_getcwd, sys_getcwd) - -#define __NR_chdir 80 -__SYSCALL(__NR_chdir, sys_chdir) -#define __NR_fchdir 81 -__SYSCALL(__NR_fchdir, sys_fchdir) -#define __NR_rename 82 -__SYSCALL(__NR_rename, sys_rename) -#define __NR_mkdir 83 -__SYSCALL(__NR_mkdir, sys_mkdir) -#define __NR_rmdir 84 -__SYSCALL(__NR_rmdir, sys_rmdir) -#define __NR_creat 85 -__SYSCALL(__NR_creat, sys_creat) -#define __NR_link 86 -__SYSCALL(__NR_link, sys_link) -#define __NR_unlink 87 -__SYSCALL(__NR_unlink, sys_unlink) - -#define __NR_symlink 88 -__SYSCALL(__NR_symlink, sys_symlink) -#define __NR_readlink 89 -__SYSCALL(__NR_readlink, sys_readlink) -#define __NR_chmod 90 -__SYSCALL(__NR_chmod, sys_chmod) -#define __NR_fchmod 91 -__SYSCALL(__NR_fchmod, sys_fchmod) -#define __NR_chown 92 -__SYSCALL(__NR_chown, sys_chown) -#define __NR_fchown 93 -__SYSCALL(__NR_fchown, sys_fchown) -#define __NR_lchown 94 
-__SYSCALL(__NR_lchown, sys_lchown) -#define __NR_umask 95 -__SYSCALL(__NR_umask, sys_umask) - -#define __NR_gettimeofday 96 -__SYSCALL(__NR_gettimeofday, sys_gettimeofday) -#define __NR_getrlimit 97 -__SYSCALL(__NR_getrlimit, sys_getrlimit) -#define __NR_getrusage 98 -__SYSCALL(__NR_getrusage, sys_getrusage) -#define __NR_sysinfo 99 -__SYSCALL(__NR_sysinfo, sys_sysinfo) -#define __NR_times 100 -__SYSCALL(__NR_times, sys_times) -#define __NR_ptrace 101 -__SYSCALL(__NR_ptrace, sys_ptrace) -#define __NR_getuid 102 -__SYSCALL(__NR_getuid, sys_getuid) -#define __NR_syslog 103 -__SYSCALL(__NR_syslog, sys_syslog) - -/* at the very end the stuff that never runs during the benchmarks */ -#define __NR_getgid 104 -__SYSCALL(__NR_getgid, sys_getgid) -#define __NR_setuid 105 -__SYSCALL(__NR_setuid, sys_setuid) -#define __NR_setgid 106 -__SYSCALL(__NR_setgid, sys_setgid) -#define __NR_geteuid 107 -__SYSCALL(__NR_geteuid, sys_geteuid) -#define __NR_getegid 108 -__SYSCALL(__NR_getegid, sys_getegid) -#define __NR_setpgid 109 -__SYSCALL(__NR_setpgid, sys_setpgid) -#define __NR_getppid 110 -__SYSCALL(__NR_getppid, sys_getppid) -#define __NR_getpgrp 111 -__SYSCALL(__NR_getpgrp, sys_getpgrp) - -#define __NR_setsid 112 -__SYSCALL(__NR_setsid, sys_setsid) -#define __NR_setreuid 113 -__SYSCALL(__NR_setreuid, sys_setreuid) -#define __NR_setregid 114 -__SYSCALL(__NR_setregid, sys_setregid) -#define __NR_getgroups 115 -__SYSCALL(__NR_getgroups, sys_getgroups) -#define __NR_setgroups 116 -__SYSCALL(__NR_setgroups, sys_setgroups) -#define __NR_setresuid 117 -__SYSCALL(__NR_setresuid, sys_setresuid) -#define __NR_getresuid 118 -__SYSCALL(__NR_getresuid, sys_getresuid) -#define __NR_setresgid 119 -__SYSCALL(__NR_setresgid, sys_setresgid) - -#define __NR_getresgid 120 -__SYSCALL(__NR_getresgid, sys_getresgid) -#define __NR_getpgid 121 -__SYSCALL(__NR_getpgid, sys_getpgid) -#define __NR_setfsuid 122 -__SYSCALL(__NR_setfsuid, sys_setfsuid) -#define __NR_setfsgid 123 -__SYSCALL(__NR_setfsgid, 
sys_setfsgid) -#define __NR_getsid 124 -__SYSCALL(__NR_getsid, sys_getsid) -#define __NR_capget 125 -__SYSCALL(__NR_capget, sys_capget) -#define __NR_capset 126 -__SYSCALL(__NR_capset, sys_capset) - -#define __NR_rt_sigpending 127 -__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending) -#define __NR_rt_sigtimedwait 128 -__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait) -#define __NR_rt_sigqueueinfo 129 -__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo) -#define __NR_rt_sigsuspend 130 -__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend) -#define __NR_sigaltstack 131 -__SYSCALL(__NR_sigaltstack, stub_sigaltstack) -#define __NR_utime 132 -__SYSCALL(__NR_utime, sys_utime) -#define __NR_mknod 133 -__SYSCALL(__NR_mknod, sys_mknod) - -/* Only needed for a.out */ -#define __NR_uselib 134 -__SYSCALL(__NR_uselib, sys_ni_syscall) -#define __NR_personality 135 -__SYSCALL(__NR_personality, sys_personality) - -#define __NR_ustat 136 -__SYSCALL(__NR_ustat, sys_ustat) -#define __NR_statfs 137 -__SYSCALL(__NR_statfs, sys_statfs) -#define __NR_fstatfs 138 -__SYSCALL(__NR_fstatfs, sys_fstatfs) -#define __NR_sysfs 139 -__SYSCALL(__NR_sysfs, sys_sysfs) - -#define __NR_getpriority 140 -__SYSCALL(__NR_getpriority, sys_getpriority) -#define __NR_setpriority 141 -__SYSCALL(__NR_setpriority, sys_setpriority) -#define __NR_sched_setparam 142 -__SYSCALL(__NR_sched_setparam, sys_sched_setparam) -#define __NR_sched_getparam 143 -__SYSCALL(__NR_sched_getparam, sys_sched_getparam) -#define __NR_sched_setscheduler 144 -__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler) -#define __NR_sched_getscheduler 145 -__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) -#define __NR_sched_get_priority_max 146 -__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) -#define __NR_sched_get_priority_min 147 -__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) -#define __NR_sched_rr_get_interval 148 -__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval) - -#define 
__NR_mlock 149 -__SYSCALL(__NR_mlock, sys_mlock) -#define __NR_munlock 150 -__SYSCALL(__NR_munlock, sys_munlock) -#define __NR_mlockall 151 -__SYSCALL(__NR_mlockall, sys_mlockall) -#define __NR_munlockall 152 -__SYSCALL(__NR_munlockall, sys_munlockall) - -#define __NR_vhangup 153 -__SYSCALL(__NR_vhangup, sys_vhangup) - -#define __NR_modify_ldt 154 -__SYSCALL(__NR_modify_ldt, sys_modify_ldt) - -#define __NR_pivot_root 155 -__SYSCALL(__NR_pivot_root, sys_pivot_root) - -#define __NR__sysctl 156 -__SYSCALL(__NR__sysctl, sys_sysctl) - -#define __NR_prctl 157 -__SYSCALL(__NR_prctl, sys_prctl) -#define __NR_arch_prctl 158 -__SYSCALL(__NR_arch_prctl, sys_arch_prctl) - -#define __NR_adjtimex 159 -__SYSCALL(__NR_adjtimex, sys_adjtimex) - -#define __NR_setrlimit 160 -__SYSCALL(__NR_setrlimit, sys_setrlimit) - -#define __NR_chroot 161 -__SYSCALL(__NR_chroot, sys_chroot) - -#define __NR_sync 162 -__SYSCALL(__NR_sync, sys_sync) - -#define __NR_acct 163 -__SYSCALL(__NR_acct, sys_acct) - -#define __NR_settimeofday 164 -__SYSCALL(__NR_settimeofday, sys_settimeofday) - -#define __NR_mount 165 -__SYSCALL(__NR_mount, sys_mount) -#define __NR_umount2 166 -__SYSCALL(__NR_umount2, sys_umount) - -#define __NR_swapon 167 -__SYSCALL(__NR_swapon, sys_swapon) -#define __NR_swapoff 168 -__SYSCALL(__NR_swapoff, sys_swapoff) - -#define __NR_reboot 169 -__SYSCALL(__NR_reboot, sys_reboot) - -#define __NR_sethostname 170 -__SYSCALL(__NR_sethostname, sys_sethostname) -#define __NR_setdomainname 171 -__SYSCALL(__NR_setdomainname, sys_setdomainname) - -#define __NR_iopl 172 -__SYSCALL(__NR_iopl, stub_iopl) -#define __NR_ioperm 173 -__SYSCALL(__NR_ioperm, sys_ioperm) - -#define __NR_create_module 174 -__SYSCALL(__NR_create_module, sys_ni_syscall) -#define __NR_init_module 175 -__SYSCALL(__NR_init_module, sys_init_module) -#define __NR_delete_module 176 -__SYSCALL(__NR_delete_module, sys_delete_module) -#define __NR_get_kernel_syms 177 -__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall) -#define 
__NR_query_module 178 -__SYSCALL(__NR_query_module, sys_ni_syscall) - -#define __NR_quotactl 179 -__SYSCALL(__NR_quotactl, sys_quotactl) - -#define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_ni_syscall) - -/* reserved for LiS/STREAMS */ -#define __NR_getpmsg 181 -__SYSCALL(__NR_getpmsg, sys_ni_syscall) -#define __NR_putpmsg 182 -__SYSCALL(__NR_putpmsg, sys_ni_syscall) - -/* reserved for AFS */ -#define __NR_afs_syscall 183 -__SYSCALL(__NR_afs_syscall, sys_ni_syscall) - -/* reserved for tux */ -#define __NR_tuxcall 184 -__SYSCALL(__NR_tuxcall, sys_ni_syscall) - -#define __NR_security 185 -__SYSCALL(__NR_security, sys_ni_syscall) - -#define __NR_gettid 186 -__SYSCALL(__NR_gettid, sys_gettid) - -#define __NR_readahead 187 -__SYSCALL(__NR_readahead, sys_readahead) -#define __NR_setxattr 188 -__SYSCALL(__NR_setxattr, sys_setxattr) -#define __NR_lsetxattr 189 -__SYSCALL(__NR_lsetxattr, sys_lsetxattr) -#define __NR_fsetxattr 190 -__SYSCALL(__NR_fsetxattr, sys_fsetxattr) -#define __NR_getxattr 191 -__SYSCALL(__NR_getxattr, sys_getxattr) -#define __NR_lgetxattr 192 -__SYSCALL(__NR_lgetxattr, sys_lgetxattr) -#define __NR_fgetxattr 193 -__SYSCALL(__NR_fgetxattr, sys_fgetxattr) -#define __NR_listxattr 194 -__SYSCALL(__NR_listxattr, sys_listxattr) -#define __NR_llistxattr 195 -__SYSCALL(__NR_llistxattr, sys_llistxattr) -#define __NR_flistxattr 196 -__SYSCALL(__NR_flistxattr, sys_flistxattr) -#define __NR_removexattr 197 -__SYSCALL(__NR_removexattr, sys_removexattr) -#define __NR_lremovexattr 198 -__SYSCALL(__NR_lremovexattr, sys_lremovexattr) -#define __NR_fremovexattr 199 -__SYSCALL(__NR_fremovexattr, sys_fremovexattr) -#define __NR_tkill 200 -__SYSCALL(__NR_tkill, sys_tkill) -#define __NR_time 201 -__SYSCALL(__NR_time, sys_time) -#define __NR_futex 202 -__SYSCALL(__NR_futex, sys_futex) -#define __NR_sched_setaffinity 203 -__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity) -#define __NR_sched_getaffinity 204 -__SYSCALL(__NR_sched_getaffinity, 
sys_sched_getaffinity) -#define __NR_set_thread_area 205 -__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */ -#define __NR_io_setup 206 -__SYSCALL(__NR_io_setup, sys_io_setup) -#define __NR_io_destroy 207 -__SYSCALL(__NR_io_destroy, sys_io_destroy) -#define __NR_io_getevents 208 -__SYSCALL(__NR_io_getevents, sys_io_getevents) -#define __NR_io_submit 209 -__SYSCALL(__NR_io_submit, sys_io_submit) -#define __NR_io_cancel 210 -__SYSCALL(__NR_io_cancel, sys_io_cancel) -#define __NR_get_thread_area 211 -__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */ -#define __NR_lookup_dcookie 212 -__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie) -#define __NR_epoll_create 213 -__SYSCALL(__NR_epoll_create, sys_epoll_create) -#define __NR_epoll_ctl_old 214 -__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall) -#define __NR_epoll_wait_old 215 -__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall) -#define __NR_remap_file_pages 216 -__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) -#define __NR_getdents64 217 -__SYSCALL(__NR_getdents64, sys_getdents64) -#define __NR_set_tid_address 218 -__SYSCALL(__NR_set_tid_address, sys_set_tid_address) -#define __NR_restart_syscall 219 -__SYSCALL(__NR_restart_syscall, sys_restart_syscall) -#define __NR_semtimedop 220 -__SYSCALL(__NR_semtimedop, sys_semtimedop) -#define __NR_fadvise64 221 -__SYSCALL(__NR_fadvise64, sys_fadvise64) -#define __NR_timer_create 222 -__SYSCALL(__NR_timer_create, sys_timer_create) -#define __NR_timer_settime 223 -__SYSCALL(__NR_timer_settime, sys_timer_settime) -#define __NR_timer_gettime 224 -__SYSCALL(__NR_timer_gettime, sys_timer_gettime) -#define __NR_timer_getoverrun 225 -__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) -#define __NR_timer_delete 226 -__SYSCALL(__NR_timer_delete, sys_timer_delete) -#define __NR_clock_settime 227 -__SYSCALL(__NR_clock_settime, sys_clock_settime) -#define __NR_clock_gettime 228 -__SYSCALL(__NR_clock_gettime, sys_clock_gettime) -#define 
__NR_clock_getres 229 -__SYSCALL(__NR_clock_getres, sys_clock_getres) -#define __NR_clock_nanosleep 230 -__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep) -#define __NR_exit_group 231 -__SYSCALL(__NR_exit_group, sys_exit_group) -#define __NR_epoll_wait 232 -__SYSCALL(__NR_epoll_wait, sys_epoll_wait) -#define __NR_epoll_ctl 233 -__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) -#define __NR_tgkill 234 -__SYSCALL(__NR_tgkill, sys_tgkill) -#define __NR_utimes 235 -__SYSCALL(__NR_utimes, sys_utimes) -#define __NR_vserver 236 -__SYSCALL(__NR_vserver, sys_ni_syscall) -#define __NR_mbind 237 -__SYSCALL(__NR_mbind, sys_mbind) -#define __NR_set_mempolicy 238 -__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) -#define __NR_get_mempolicy 239 -__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) -#define __NR_mq_open 240 -__SYSCALL(__NR_mq_open, sys_mq_open) -#define __NR_mq_unlink 241 -__SYSCALL(__NR_mq_unlink, sys_mq_unlink) -#define __NR_mq_timedsend 242 -__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) -#define __NR_mq_timedreceive 243 -__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) -#define __NR_mq_notify 244 -__SYSCALL(__NR_mq_notify, sys_mq_notify) -#define __NR_mq_getsetattr 245 -__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) -#define __NR_kexec_load 246 -__SYSCALL(__NR_kexec_load, sys_kexec_load) -#define __NR_waitid 247 -__SYSCALL(__NR_waitid, sys_waitid) -#define __NR_add_key 248 -__SYSCALL(__NR_add_key, sys_add_key) -#define __NR_request_key 249 -__SYSCALL(__NR_request_key, sys_request_key) -#define __NR_keyctl 250 -__SYSCALL(__NR_keyctl, sys_keyctl) -#define __NR_ioprio_set 251 -__SYSCALL(__NR_ioprio_set, sys_ioprio_set) -#define __NR_ioprio_get 252 -__SYSCALL(__NR_ioprio_get, sys_ioprio_get) -#define __NR_inotify_init 253 -__SYSCALL(__NR_inotify_init, sys_inotify_init) -#define __NR_inotify_add_watch 254 -__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) -#define __NR_inotify_rm_watch 255 -__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) -#define 
__NR_migrate_pages 256 -__SYSCALL(__NR_migrate_pages, sys_migrate_pages) -#define __NR_openat 257 -__SYSCALL(__NR_openat, sys_openat) -#define __NR_mkdirat 258 -__SYSCALL(__NR_mkdirat, sys_mkdirat) -#define __NR_mknodat 259 -__SYSCALL(__NR_mknodat, sys_mknodat) -#define __NR_fchownat 260 -__SYSCALL(__NR_fchownat, sys_fchownat) -#define __NR_futimesat 261 -__SYSCALL(__NR_futimesat, sys_futimesat) -#define __NR_newfstatat 262 -__SYSCALL(__NR_newfstatat, sys_newfstatat) -#define __NR_unlinkat 263 -__SYSCALL(__NR_unlinkat, sys_unlinkat) -#define __NR_renameat 264 -__SYSCALL(__NR_renameat, sys_renameat) -#define __NR_linkat 265 -__SYSCALL(__NR_linkat, sys_linkat) -#define __NR_symlinkat 266 -__SYSCALL(__NR_symlinkat, sys_symlinkat) -#define __NR_readlinkat 267 -__SYSCALL(__NR_readlinkat, sys_readlinkat) -#define __NR_fchmodat 268 -__SYSCALL(__NR_fchmodat, sys_fchmodat) -#define __NR_faccessat 269 -__SYSCALL(__NR_faccessat, sys_faccessat) -#define __NR_pselect6 270 -__SYSCALL(__NR_pselect6, sys_pselect6) -#define __NR_ppoll 271 -__SYSCALL(__NR_ppoll, sys_ppoll) -#define __NR_unshare 272 -__SYSCALL(__NR_unshare, sys_unshare) -#define __NR_set_robust_list 273 -__SYSCALL(__NR_set_robust_list, sys_set_robust_list) -#define __NR_get_robust_list 274 -__SYSCALL(__NR_get_robust_list, sys_get_robust_list) -#define __NR_splice 275 -__SYSCALL(__NR_splice, sys_splice) -#define __NR_tee 276 -__SYSCALL(__NR_tee, sys_tee) -#define __NR_sync_file_range 277 -__SYSCALL(__NR_sync_file_range, sys_sync_file_range) -#define __NR_vmsplice 278 -__SYSCALL(__NR_vmsplice, sys_vmsplice) -#define __NR_move_pages 279 -__SYSCALL(__NR_move_pages, sys_move_pages) -#define __NR_utimensat 280 -__SYSCALL(__NR_utimensat, sys_utimensat) -#define __NR_epoll_pwait 281 -__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) -#define __NR_signalfd 282 -__SYSCALL(__NR_signalfd, sys_signalfd) -#define __NR_timerfd_create 283 -__SYSCALL(__NR_timerfd_create, sys_timerfd_create) -#define __NR_eventfd 284 
-__SYSCALL(__NR_eventfd, sys_eventfd) -#define __NR_fallocate 285 -__SYSCALL(__NR_fallocate, sys_fallocate) -#define __NR_timerfd_settime 286 -__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) -#define __NR_timerfd_gettime 287 -__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) -#define __NR_accept4 288 -__SYSCALL(__NR_accept4, sys_accept4) -#define __NR_signalfd4 289 -__SYSCALL(__NR_signalfd4, sys_signalfd4) -#define __NR_eventfd2 290 -__SYSCALL(__NR_eventfd2, sys_eventfd2) -#define __NR_epoll_create1 291 -__SYSCALL(__NR_epoll_create1, sys_epoll_create1) -#define __NR_dup3 292 -__SYSCALL(__NR_dup3, sys_dup3) -#define __NR_pipe2 293 -__SYSCALL(__NR_pipe2, sys_pipe2) -#define __NR_inotify_init1 294 -__SYSCALL(__NR_inotify_init1, sys_inotify_init1) -#define __NR_preadv 295 -__SYSCALL(__NR_preadv, sys_preadv) -#define __NR_pwritev 296 -__SYSCALL(__NR_pwritev, sys_pwritev) -#define __NR_rt_tgsigqueueinfo 297 -__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_event_open 298 -__SYSCALL(__NR_perf_event_open, sys_perf_event_open) -#define __NR_recvmmsg 299 -__SYSCALL(__NR_recvmmsg, sys_recvmmsg) -#define __NR_fanotify_init 300 -__SYSCALL(__NR_fanotify_init, sys_fanotify_init) -#define __NR_fanotify_mark 301 -__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) -#define __NR_prlimit64 302 -__SYSCALL(__NR_prlimit64, sys_prlimit64) -#define __NR_name_to_handle_at 303 -__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) -#define __NR_open_by_handle_at 304 -__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) -#define __NR_clock_adjtime 305 -__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) -#define __NR_syncfs 306 -__SYSCALL(__NR_syncfs, sys_syncfs) -#define __NR_sendmmsg 307 -__SYSCALL(__NR_sendmmsg, sys_sendmmsg) -#define __NR_setns 308 -__SYSCALL(__NR_setns, sys_setns) -#define __NR_getcpu 309 -__SYSCALL(__NR_getcpu, sys_getcpu) -#define __NR_process_vm_readv 310 -__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) -#define 
__NR_process_vm_writev 311 -__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) - -#ifndef __NO_STUBS -#define __ARCH_WANT_OLD_READDIR -#define __ARCH_WANT_OLD_STAT -#define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK -#define __ARCH_WANT_SYS_SIGNAL -#define __ARCH_WANT_SYS_UTIME -#define __ARCH_WANT_SYS_WAITPID -#define __ARCH_WANT_SYS_SOCKETCALL -#define __ARCH_WANT_SYS_FADVISE64 -#define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK -#define __ARCH_WANT_SYS_NICE -#define __ARCH_WANT_SYS_OLD_GETRLIMIT -#define __ARCH_WANT_SYS_OLD_UNAME -#define __ARCH_WANT_SYS_OLDUMOUNT -#define __ARCH_WANT_SYS_SIGPENDING -#define __ARCH_WANT_SYS_SIGPROCMASK -#define __ARCH_WANT_SYS_RT_SIGACTION -#define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_TIME -#define __ARCH_WANT_COMPAT_SYS_TIME -#endif /* __NO_STUBS */ - -#ifdef __KERNEL__ - -#ifndef COMPILE_OFFSETS -#include -#define NR_syscalls (__NR_syscall_max + 1) -#endif - -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_UNISTD_64_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c4871c..8c473d9d0b83 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-y += syscall_$(BITS).o +obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/asm-offsets_32.c 
b/arch/x86/kernel/asm-offsets_32.c index 395a10e68067..85d98ab15cdc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,6 +3,11 @@ #include #include "../../../drivers/lguest/lg.h" +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include +}; + /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -76,4 +81,7 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72a1194af22..834e897b1e25 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,11 +1,12 @@ #include -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_UNISTD_64_H -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls_64[] = { +#include +}; +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls_ia32[] = { +#include }; int main(void) @@ -72,7 +73,11 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); + DEFINE(NR_syscalls, sizeof(syscalls_64)); + + DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f5344001..1ffcda22c2f6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -81,8 +81,6 @@ * enough to patch inline, increasing performance. 
*/ -#define nr_syscalls ((syscall_table_size)/4) - #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -423,7 +421,7 @@ sysenter_past_esp: testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) @@ -504,7 +502,7 @@ ENTRY(system_call) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) @@ -650,7 +648,7 @@ syscall_trace_entry: movl %esp, %eax call syscall_trace_enter /* What it returned is what we'll actually use. */ - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jnae syscall_call jmp syscall_exit END(syscall_trace_entry) @@ -690,29 +688,28 @@ END(syscall_badsys) * System calls that need a pt_regs pointer. */ #define PTREGSCALL0(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL1(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL2(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%ecx; \ movl (PT_ECX+4)(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL3(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ CFI_STARTPROC; \ leal 4(%esp),%eax; \ pushl_cfi %eax; \ @@ -737,8 +734,7 @@ PTREGSCALL2(vm86) PTREGSCALL1(vm86old) /* Clone is an oddball. 
The 4th arg is in %edi */ - ALIGN; -ptregs_clone: +ENTRY(ptregs_clone) CFI_STARTPROC leal 4(%esp),%eax pushl_cfi %eax @@ -1209,11 +1205,6 @@ return_to_handler: jmp *%ecx #endif -.section .rodata,"a" -#include "syscall_table_32.S" - -syscall_table_size=(.-sys_call_table) - /* * Some functions should be protected against kprobes */ diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c new file mode 100644 index 000000000000..b37a57336609 --- /dev/null +++ b/arch/x86/kernel/syscall_32.c @@ -0,0 +1,25 @@ +/* System call table for i386. */ + +#include +#include +#include +#include + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 0edfafa1b269..7ac7943be02c 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,15 +5,11 @@ #include #include -#define __NO_STUBS +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_UNISTD_64_H +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, typedef void (*sys_call_ptr_t)(void); @@ -25,5 +21,5 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { * when the & below is removed. */ [0 ... 
__NR_syscall_max] = &sys_ni_syscall, -#include +#include }; diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S deleted file mode 100644 index 9a0e31293920..000000000000 --- a/arch/x86/kernel/syscall_table_32.S +++ /dev/null @@ -1,350 +0,0 @@ -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long ptregs_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long ptregs_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long 
sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long sys_old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long sys_old_readdir - .long sys_old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long ptregs_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long ptregs_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long ptregs_sigreturn - .long ptregs_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - 
.long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long ptregs_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long sys_ni_syscall /* Old nfsservctl */ - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long ptregs_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long ptregs_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long ptregs_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap_pgoff - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long 
sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive /* 280 */ - .long sys_mq_notify - .long sys_mq_getsetattr - .long sys_kexec_load - .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ - .long sys_add_key - .long sys_request_key - .long sys_keyctl - .long sys_ioprio_set - .long sys_ioprio_get /* 290 */ - .long sys_inotify_init - .long sys_inotify_add_watch - .long sys_inotify_rm_watch - .long sys_migrate_pages - .long sys_openat /* 295 */ - .long sys_mkdirat - .long sys_mknodat - .long sys_fchownat 
- .long sys_futimesat - .long sys_fstatat64 /* 300 */ - .long sys_unlinkat - .long sys_renameat - .long sys_linkat - .long sys_symlinkat - .long sys_readlinkat /* 305 */ - .long sys_fchmodat - .long sys_faccessat - .long sys_pselect6 - .long sys_ppoll - .long sys_unshare /* 310 */ - .long sys_set_robust_list - .long sys_get_robust_list - .long sys_splice - .long sys_sync_file_range - .long sys_tee /* 315 */ - .long sys_vmsplice - .long sys_move_pages - .long sys_getcpu - .long sys_epoll_pwait - .long sys_utimensat /* 320 */ - .long sys_signalfd - .long sys_timerfd_create - .long sys_eventfd - .long sys_fallocate - .long sys_timerfd_settime /* 325 */ - .long sys_timerfd_gettime - .long sys_signalfd4 - .long sys_eventfd2 - .long sys_epoll_create1 - .long sys_dup3 /* 330 */ - .long sys_pipe2 - .long sys_inotify_init1 - .long sys_preadv - .long sys_pwritev - .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_event_open - .long sys_recvmmsg - .long sys_fanotify_init - .long sys_fanotify_mark - .long sys_prlimit64 /* 340 */ - .long sys_name_to_handle_at - .long sys_open_by_handle_at - .long sys_clock_adjtime - .long sys_syncfs - .long sys_sendmmsg /* 345 */ - .long sys_setns - .long sys_process_vm_readv - .long sys_process_vm_writev -- cgit v1.2.3 From f14525f9e033f344996905744f41680ea2b877ce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Nov 2011 16:03:27 -0800 Subject: x86: Simplify syscallhdr.sh Simplify syscallhdr.sh by letting grep sort out the ABIs that we want, rather than relying on manual list matching. This is safe since the ABI strings already have to consist only of characters which are valid in C macro names. Suggested-by: Matt Helsley Link: http://lkml.kernel.org/r/20111118221558.GA6408@count0.beaverton.ibm.com Signed-off-by: H. 
Peter Anvin --- arch/x86/syscalls/syscallhdr.sh | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh index 0d473ff12eaf..b3c593072785 100644 --- a/arch/x86/syscalls/syscallhdr.sh +++ b/arch/x86/syscalls/syscallhdr.sh @@ -2,33 +2,20 @@ in="$1" out="$2" -my_abis=`echo "$3" | tr ',' ' '` +my_abis=`echo "($3)" | tr ',' '|'` prefix="$4" offset="$5" fileguard=_ASM_X86_`basename "$out" | sed \ -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` - -in_list () { - local x - for x in $1; do - if [ x"$x" = x"$2" ]; then - return 0 - fi - done - return 1 -} - -grep '^[0-9]' "$in" | sort -n | ( +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( echo "#ifndef ${fileguard}" echo "#define ${fileguard} 1" echo "" while read nr abi name entry ; do - if in_list "$my_abis" "$abi"; then - echo "#define __NR_${prefix}${name}" $((nr+offset)) - fi + echo "#define __NR_${prefix}${name}" $((nr+offset)) done echo "" -- cgit v1.2.3 From 61f1e7e20874e8f11dab69b6a4bf7616badd4fe8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Nov 2011 16:25:07 -0800 Subject: x86, syscall: Re-fix typo in comment Fix the same typo as was fixed in: b7641d2c x86-64, syscall: Adjust comment spacing and remove typo ... for the new versions of this file (32-bit and IA32 compat). Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1321569446-20433-4-git-send-email-hpa@linux.intel.com --- arch/x86/ia32/syscall_ia32.c | 2 +- arch/x86/kernel/syscall_32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c index d04d3dbc47d4..4754ba0f5d9f 100644 --- a/arch/x86/ia32/syscall_ia32.c +++ b/arch/x86/ia32/syscall_ia32.c @@ -17,7 +17,7 @@ extern void compat_ni_syscall(void); const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* - * Smells like a like a compiler bug -- it doesn't work + * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index b37a57336609..147fcd4941c4 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -17,7 +17,7 @@ extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - * Smells like a like a compiler bug -- it doesn't work + * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ [0 ... __NR_syscall_max] = &sys_ni_syscall, -- cgit v1.2.3 From 3f86886c72fb68088162c7e08cc7f85282f1860c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Nov 2011 17:01:19 -0800 Subject: x86, syscall: Allow syscall offset to be symbolic Allow the specified syscall offset to be symbolic, e.g. a macro. For offset system calls, this if nothing else makes the generated code easier to read. Suggested-by: H. J. Lu Link: http://lkml.kernel.org/r/1321569446-20433-7-git-send-email-hpa@linux.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/syscalls/syscallhdr.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh index b3c593072785..31fd5f1f38f7 100644 --- a/arch/x86/syscalls/syscallhdr.sh +++ b/arch/x86/syscalls/syscallhdr.sh @@ -15,7 +15,11 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( echo "" while read nr abi name entry ; do - echo "#define __NR_${prefix}${name}" $((nr+offset)) + if [ -z "$offset" ]; then + echo "#define __NR_${prefix}${name} $nr" + else + echo "#define __NR_${prefix}${name} ($offset + $nr)" + fi done echo "" -- cgit v1.2.3 From 37fe6a42b3433b79a159ceb06a94cd1ef00e279d Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:29 +0900 Subject: x86: Check stack overflow in detail Currently, only kernel stack is checked for the overflow, which is not sufficient for systems that need a high reliability. To enhance it, it is required to check the IRQ and exception stacks, as well. This patch checks all the stack types and will cause messages of stacks in detail when free stack space drops below a certain limit except user stack. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Link: http://lkml.kernel.org/r/20111129060829.11076.51733.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/Kconfig.debug | 7 +++++-- arch/x86/kernel/irq_64.c | 29 +++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf56e1793272..4caec1261f12 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL ---help--- - This option will cause messages to be printed if free stack space - drops below a certain limit. 
+ Say Y here if you want to check the overflows of kernel, IRQ + and exception stacks. This option will cause messages of the + stacks in detail when free stack space drops below a certain + limit. + If in doubt, say "N". config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 69bca468c47a..928a7e909619 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -36,18 +36,35 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW + struct orig_ist *oist; + u64 irq_stack_top, irq_stack_bottom; + u64 estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); if (user_mode_vm(regs)) return; - WARN_ONCE(regs->sp >= curbase && - regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128, + if (regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128) + return; + + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); + irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); + if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) + return; + + oist = &__get_cpu_var(orig_ist); + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ; + estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + if (regs->sp >= estack_top && regs->sp <= estack_bottom) + return; - "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + current->comm, curbase, regs->sp, + irq_stack_top, irq_stack_bottom, + estack_top, estack_bottom); #endif } -- cgit v1.2.3 From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 
2011 15:08:36 +0900 Subject: x86: Panic on detection of stack overflow Currently, messages are just output on the detection of stack overflow, which is not sufficient for systems that need a high reliability. This is because in general the overflow may corrupt data, and the additional corruption may occur due to reading them unless systems stop. This patch adds the sysctl parameter kernel.panic_on_stackoverflow and causes a panic when detecting the overflows of kernel, IRQ and exception stacks except user stack according to the parameter. It is disabled by default. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 14 ++++++++++++++ arch/x86/kernel/irq_32.c | 2 ++ arch/x86/kernel/irq_64.c | 5 +++++ include/linux/kernel.h | 1 + kernel/sysctl.c | 9 +++++++++ 5 files changed, 31 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 1f2463671a1a..6d8cd8b2c30d 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -49,6 +49,7 @@ show up in /proc/sys/kernel: - panic - panic_on_oops - panic_on_unrecovered_nmi +- panic_on_stackoverflow - pid_max - powersave-nap [ PPC only ] - printk @@ -393,6 +394,19 @@ Controls the kernel's behaviour when an oops or BUG is encountered. ============================================================== +panic_on_stackoverflow: + +Controls the kernel's behavior when detecting the overflows of +kernel, IRQ and exception stacks except a user stack. +This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled. + +0: try to continue operation. + +1: panic immediately. + +============================================================== + + pid_max: PID allocation wrap value. 
When the kernel's next PID value diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 72090705a656..e16e99ebd7ad 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -43,6 +43,8 @@ static void print_stack_overflow(void) { printk(KERN_WARNING "low stack detected by irq handler\n"); dump_stack(); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); } #else diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 928a7e909619..42552b0dce6a 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); +int sysctl_panic_on_stackoverflow; + /* * Probabilistic stack overflow check: * @@ -65,6 +67,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, estack_top, estack_bottom); + + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); #endif } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e8b1597b5cf2..ff83683c0b9d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -341,6 +341,7 @@ extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; +extern int sysctl_panic_on_stackoverflow; extern const char *print_tainted(void); extern void add_taint(unsigned flag); extern int test_taint(unsigned flag); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae2719643854..f487f257e05e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_DEBUG_STACKOVERFLOW + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + 
}, +#endif { .procname = "bootloader_type", .data = &bootloader_type, -- cgit v1.2.3 From 467e6b7a7c0eb792ebaf322ddb7363742b4ead40 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:45 +0900 Subject: x86: Clean up the range of stack overflow checking The overflow checking of kernel stack checks if the stack pointer points to the available kernel stack range, which is derived from the original overflow checking. It is clear that curbase address is always less than low boundary of available kernel stack. So, this patch removes the first condition that checks if the pointer is higher than curbase. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Link: http://lkml.kernel.org/r/20111129060845.11076.40916.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/kernel/irq_64.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 42552b0dce6a..54e2b2b2e250 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -46,10 +46,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode_vm(regs)) return; - if (regs->sp >= curbase && - regs->sp <= curbase + THREAD_SIZE && - regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128) + if (regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128 && + regs->sp <= curbase + THREAD_SIZE) return; irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); -- cgit v1.2.3 From 3603a2512f9e69dc87914ba922eb4a0812b21cd6 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:25 -0400 Subject: x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus A recent discussion started talking about the locking on the pstore fs and how it relates to the kmsg infrastructure. 
We noticed it was possible for userspace to r/w to the pstore fs (grabbing the locks in the process) and block the panic path from r/w to the same fs. The reason was the cpu with the lock could be doing work while the crashing cpu is panic'ing. Busting those spinlocks might cause those cpus to step on each other's data. Fine, fair enough. It was suggested it would be nice to serialize the panic path (i.e. stop the other cpus) and have only one cpu running. This would allow us to bust the spinlocks and not worry about another cpu stepping on the data. Of course, smp_send_stop() does this in the panic case. kmsg_dump() would have to be moved to be called after it. Easy enough. The only problem is on x86 the smp_send_stop() function calls the REBOOT_VECTOR. Any cpu with irqs disabled (which pstore and its backend ERST would do) blocks this IPI and thus does not stop. This makes it difficult to reliably log data to the pstore fs. The patch below switches from the REBOOT_VECTOR to NMI (and mimics what kdump does). Switching to NMI allows us to deliver the IPI when irqs are disabled, increasing the reliability of this function. However, Andi carefully noted that on some machines this approach does not work because of broken BIOSes or whatever. To help accommodate this, the next couple of patches will run a selftest and provide a knob to disable it.
V2: uses atomic ops to serialize the cpu that shuts everyone down V3: comment cleanup Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 16204dc15484..e72b1754a2d7 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } +static atomic_t stopping_cpu = ATOMIC_INIT(-1); + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + +static void native_nmi_stop_other_cpus(int wait) +{ + unsigned long flags; + unsigned long timeout; + + if (reboot_force) + return; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + */ + if (num_online_cpus() > 1) { + /* did someone beat us here? 
*/ + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) + return; + + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + return; + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_allbutself(NMI_VECTOR); + + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); +} + /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_stop_other_cpus(int wait) +static void native_irq_stop_other_cpus(int wait) { unsigned long flags; unsigned long timeout; @@ -230,7 +285,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .stop_other_cpus = native_stop_other_cpus, + .stop_other_cpus = native_nmi_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, -- cgit v1.2.3 From 99e8b9ca90d688c3ac7d3a141b701c9694a93925 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:26 -0400 Subject: x86, NMI: Add NMI IPI selftest The previous patch modified the stop cpus path to use NMI instead of IRQ as the way to communicate to the other cpus to shutdown. There were some concerns that various machines may have problems with using an NMI IPI. This patch creates a selftest to check if NMI is working at boot. The idea is to help catch any issues before the machine panics and we learn the hard way. Loosely based on the locking-selftest.c file, this separate file runs a couple of simple tests and reports the results. The output looks like: ...
Brought up 4 CPUs ---------------- | NMI testsuite: -------------------- remote IPI: ok | local IPI: ok | -------------------- Good, all 2 testcases passed! | --------------------------------- Total of 4 processors activated (21330.61 BogoMIPS). ... Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 12 +++ arch/x86/include/asm/smp.h | 6 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/nmi_selftest.c | 179 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 1 + 5 files changed, 199 insertions(+) create mode 100644 arch/x86/kernel/nmi_selftest.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 4caec1261f12..97da3c17b424 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -287,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS If unsure, or if you run an older (pre 4.4) gcc, say N. +config DEBUG_NMI_SELFTEST + bool "NMI Selftest" + depends on DEBUG_KERNEL + ---help--- + Enabling this option turns on a quick NMI selftest to verify + that the NMI behaves correctly. + + This might help diagnose strange hangs that rely on NMI to + function properly. + + If unsure, say N. 
+ endmenu diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 73b11bc0ae6f..0434c400287c 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void); #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_DEBUG_NMI_SELFTEST +extern void nmi_selftest(void); +#else +#define nmi_selftest() do { } while (0) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c4871c..02b2f05b371e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o +obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 000000000000..572adb622251 --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,179 @@ +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + * Copyright (C) 2011 Red Hat, Inc., Don Zickus + */ + +#include +#include +#include + +#include +#include + +#define SUCCESS 0 +#define FAILURE 1 +#define TIMEOUT 2 + +static int nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; + +static int testcase_total; +static int testcase_successes; +static int expected_testcase_failures; +static int unexpected_testcase_failures; +static int unexpected_testcase_unknowns; + +static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ + unexpected_testcase_unknowns++; + return NMI_HANDLED; +} + +static void init_nmi_testsuite(void) +{ + /* trap all the unknown NMIs we may generate */ + 
register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); +} + +static void cleanup_nmi_testsuite(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) + return NMI_HANDLED; + + return NMI_DONE; +} + +static void test_nmi_ipi(struct cpumask *mask) +{ + unsigned long timeout; + + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest")) { + nmi_fail = FAILURE; + return; + } + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_mask(mask, NMI_VECTOR); + + /* Don't wait longer than a second */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(mask) && --timeout) + udelay(1); + + /* What happens if we timeout, do we still unregister?? */ + unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + + if (!timeout) + nmi_fail = TIMEOUT; + return; +} + +static void remote_ipi(void) +{ + cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void local_ipi(void) +{ + cpumask_clear(to_cpumask(nmi_ipi_mask)); + cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void reset_nmi(void) +{ + nmi_fail = 0; +} + +static void dotest(void (*testcase_fn)(void), int expected) +{ + testcase_fn(); + /* + * Filter out expected failures: + */ + if (nmi_fail != expected) { + unexpected_testcase_failures++; + + if (nmi_fail == FAILURE) + printk("FAILED |"); + else if (nmi_fail == TIMEOUT) + printk("TIMEOUT|"); + else + printk("ERROR |"); + dump_stack(); + } else { + testcase_successes++; + printk(" ok |"); + } + testcase_total++; + + reset_nmi(); +} + +static inline void print_testname(const char *testname) +{ + printk("%12s:", testname); +} + +void
nmi_selftest(void) +{ + init_nmi_testsuite(); + + /* + * Run the testsuite: + */ + printk("----------------\n"); + printk("| NMI testsuite:\n"); + printk("--------------------\n"); + + print_testname("remote IPI"); + dotest(remote_ipi, SUCCESS); + printk("\n"); + print_testname("local IPI"); + dotest(local_ipi, SUCCESS); + printk("\n"); + + cleanup_nmi_testsuite(); + + if (unexpected_testcase_failures) { + printk("--------------------\n"); + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + } else { + printk("--------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + } +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb4a958..19277817effa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1142,6 +1142,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done.\n"); + nmi_selftest(); impress_friends(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -- cgit v1.2.3 From bda62633983f9db49ce0b1a9235b3709c1cda5f0 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:27 -0400 Subject: x86, NMI: Add knob to disable using NMI IPIs to stop cpus Some machines may exhibit problems using the NMI to stop other cpus. 
This knob just allows one to revert back to the original behaviour to help diagnose the problem. V2: make function static Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-4-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/kernel/smp.c | 13 +++++++++++++ 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a0c5c5f4fce6..b4339e5a50da 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1796,6 +1796,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nomfgpt [X86-32] Disable Multi-Function General Purpose Timer usage (for AMD Geode machines). + nonmi_ipi [X86] Disable using NMI IPIs during panic/reboot to + shutdown the other cpus. Instead use the REBOOT_VECTOR + irq. + nopat [X86] Disable PAT (page attribute table extension of pagetables) support. diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e72b1754a2d7..113acda5879e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -249,6 +249,11 @@ static void native_irq_stop_other_cpus(int wait) local_irq_restore(flags); } +static void native_smp_disable_nmi_ipi(void) +{ + smp_ops.stop_other_cpus = native_irq_stop_other_cpus; +} + /* * Reschedule call back. 
*/ @@ -280,6 +285,14 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) irq_exit(); } +static int __init nonmi_ipi_setup(char *str) +{ + native_smp_disable_nmi_ipi(); + return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, -- cgit v1.2.3 From 53b5650273fea486ac8ac6c1d1e9a6cd17aa31ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 Dec 2011 12:25:44 +0100 Subject: x86: Fix the 32-bit stackoverflow-debug build The panic_on_stackoverflow variable needs to be available on the 32-bit side as well ... Cc: Mitsuo Hayasaka Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e16e99ebd7ad..40fc86161d92 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); #ifdef CONFIG_DEBUG_STACKOVERFLOW + +int sysctl_panic_on_stackoverflow __read_mostly; + /* Debugging check for stack overflow: is there less than 1KB free?
*/ static int check_stack_overflow(void) { -- cgit v1.2.3 From 54b0264ec8c6e90f0413ad30e2f91c65e7844613 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 16 Nov 2011 18:17:40 +0000 Subject: x86/sfi: Kill the IRQ as id hack Nothing should now need it so take it out Signed-off-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index b1489a06a49d..6a21f603bd78 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -802,8 +802,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry) if (mrst_has_msic()) return; - /* ID as IRQ is a hack that will go away */ - pdev = platform_device_alloc(entry->name, entry->irq); + pdev = platform_device_alloc(entry->name, 0); if (pdev == NULL) { pr_err("out of memory for SFI platform device '%s'.\n", entry->name); -- cgit v1.2.3 From 1ea7c6737c8f68453f55c894b3d07d7f48fcbef8 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 10 Nov 2011 13:29:14 +0000 Subject: x86/config: Revamp configuration for MID devices This follows on from the patch applied in 3.2rc1 which creates an INTEL_MID configuration. We can now add the entry for Medfield specific code. After this is merged the final patch will be submitted which moves the rest of the device Kconfig dependencies to MRST/MEDFIELD/INTEL_MID as appropriate. Signed-off-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 17 +++++++++++++++++ arch/x86/Kconfig.debug | 6 +++--- arch/x86/kernel/early_printk.c | 2 +- arch/x86/platform/mrst/Makefile | 2 +- 4 files changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb9a1044a771..9e7a361423d6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -419,6 +419,23 @@ config X86_MRST nor standard legacy replacement devices/features. e.g.
Moorestown does not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. +config X86_MDFLD + bool "Medfield MID platform" + depends on PCI + depends on PCI_GOANY + depends on X86_IO_APIC + select APB_TIMER + select I2C + select SPI + select INTEL_SCU_IPC + select X86_PLATFORM_DEVICES + ---help--- + Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin + Internet Device(MID) platform. + Unlike standard x86 PCs, Medfield does not have many legacy devices + nor standard legacy replacement devices/features. e.g. Medfield does + not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. + endif config X86_RDC321X diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf56e1793272..28c3c73ab208 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,9 +43,9 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. -config EARLY_PRINTK_MRST - bool "Early printk for MRST platform support" - depends on EARLY_PRINTK && X86_MRST +config EARLY_PRINTK_INTEL_MID + bool "Early printk for Intel MID platform support" + depends on EARLY_PRINTK && X86_INTEL_MID config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a350f7f9..7a53da03086f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_EARLY_PRINTK_MRST +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index 1ea38775a6d3..ddeec7300464 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,4 +1,4 @@ 
obj-$(CONFIG_X86_MRST) += mrst.o obj-$(CONFIG_X86_MRST) += vrtc.o -obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o +obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o obj-$(CONFIG_X86_MRST) += pmu.o -- cgit v1.2.3 From 9af0c7a6fa860698d080481f24a342ba74b68982 Mon Sep 17 00:00:00 2001 From: Ludwig Nussel Date: Tue, 15 Nov 2011 14:46:46 -0800 Subject: x86: Fix mmap random address range On x86_32 casting the unsigned int result of get_random_int() to long may result in a negative value. On x86_32 the range of mmap_rnd() therefore was -255 to 255. The 32bit mode on x86_64 used 0 to 255 as intended. The bug was introduced by 675a081 ("x86: unify mmap_{32|64}.c") in January 2008. Signed-off-by: Ludwig Nussel Cc: Linus Torvalds Cc: harvey.harrison@gmail.com Cc: "H. Peter Anvin" Cc: Harvey Harrison Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/201111152246.pAFMklOB028527@wpaz5.hot.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 4b5ba85eb5c9..845df6835f9f 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void) */ if (current->flags & PF_RANDOMIZE) { if (mmap_is_ia32()) - rnd = (long)get_random_int() % (1<<8); + rnd = get_random_int() % (1<<8); else - rnd = (long)(get_random_int() % (1<<28)); + rnd = get_random_int() % (1<<28); } return rnd << PAGE_SHIFT; } -- cgit v1.2.3 From d1bbdd669298b7ca08284ddb29153dfc039dd89d Mon Sep 17 00:00:00 2001 From: Mike Ditto Date: Tue, 15 Nov 2011 14:46:50 -0800 Subject: arch/x86/kernel/e820.c: Eliminate bubble sort from sanitize_e820_map() Replace the bubble sort in sanitize_e820_map() with a call to the generic kernel sort function to avoid pathological performance with large maps. On large (thousands of entries) E820 maps, the previous code took minutes to run; with this change it's now milliseconds. 
Signed-off-by: Mike Ditto Cc: sassmann@kpanic.de Cc: yuenn@google.com Cc: Stefan Assmann Cc: Nancy Yuen Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 59 ++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e48f076..f655f802260d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -227,22 +228,38 @@ void __init e820_print_map(char *who) * ____________________33__ * ______________________4_ */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are unequal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 
1 : -1; + + return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +} int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map) { - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; static struct change_member change_point_list[2*E820_X_MAX] __initdata; static struct change_member *change_point[2*E820_X_MAX] __initdata; static struct e820entry *overlap_list[E820_X_MAX] __initdata; static struct e820entry new_bios[E820_X_MAX] __initdata; - struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; - int chgidx, still_changing; + int chgidx; int overlap_entries; int new_bios_entry; int old_nr, new_nr, chg_nr; @@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i = 1; i < chg_nr; i++) { - unsigned long long curaddr, lastaddr; - unsigned long long curpbaddr, lastpbaddr; - - curaddr = change_point[i]->addr; - lastaddr = change_point[i - 1]->addr; - curpbaddr = change_point[i]->pbios->addr; - lastpbaddr = change_point[i - 1]->pbios->addr; - - /* - * swap entries, when: - * - * curaddr > lastaddr or - * curaddr == lastaddr and curaddr == curpbaddr and - * lastaddr != lastpbaddr - */ - if (curaddr < lastaddr || - (curaddr == lastaddr && curaddr == curpbaddr && - lastaddr != lastpbaddr)) { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing = 1; - } - } - } + sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0); /* create a new bios memory map, removing overlaps */ overlap_entries = 0; /* number of entries in the overlap table */ -- cgit v1.2.3 From 706d9a9c8b5758390036b9980a2b12d809599777 Mon Sep 17 00:00:00 
2001 From: H Hartley Sweeten Date: Tue, 15 Nov 2011 14:48:56 -0800 Subject: arch/x86/kernel/e820.c: quiet sparse noise about plain integer as NULL pointer The last parameter to sort() is a pointer to the function used to swap items. This parameter should be NULL, not 0, when not used. This quiets the following sparse warning: warning: Using plain integer as NULL pointer Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Cc: hartleys@visionengravers.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f655f802260d..d6bd85352c81 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -296,7 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ - sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0); + sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); /* create a new bios memory map, removing overlaps */ overlap_entries = 0; /* number of entries in the overlap table */ -- cgit v1.2.3 From b565201cf75210614903ef2ae5917b4379681647 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 15 Nov 2011 15:33:56 -0800 Subject: x86: Reduce clock calibration time during slave cpu startup Reduce the startup time for slave cpus. Adds hooks for an arch-specific function for clock calibration. These hooks are used on x86. If a newly started cpu has the same phys_proc_id as a core already active, uses the TSC for the delay loop and has a CONSTANT_TSC, use the already-calculated value of loops_per_jiffy. This patch reduces the time required to start slave cpus on a 4096 cpu system from: 465 sec OLD 62 sec NEW This reduces boot time on a 4096p system by almost 7 minutes. Nice... Signed-off-by: Jack Steiner Cc: "H. 
Peter Anvin" Cc: John Stultz [fix CONFIG_SMP=n build] Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 16 +++++++++++----- arch/x86/kernel/tsc.c | 20 ++++++++++++++++++++ init/calibrate.c | 15 +++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb4a958..00eef55c8327 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -207,22 +207,28 @@ static void __cpuinit smp_callin(void) * Need to setup vector mappings before we enable interrupts. */ setup_vector_irq(smp_processor_id()); + + /* + * Save our processor parameters. Note: this information + * is needed for clock calibration. + */ + smp_store_cpu_info(cpuid); + /* * Get our bogomips. + * Update loops_per_jiffy in cpu_data. Previous call to + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. * * Need to enable IRQs because it can take longer and then * the NMI watchdog might kill us. */ local_irq_enable(); calibrate_delay(); + cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); - /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - /* * This must be done before setting cpu_online_mask * or calling notify_cpu_starting. diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db483369f10b..490fb330be87 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -995,3 +995,23 @@ void __init tsc_init(void) check_system_tsc_reliable(); } +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. 
+ */ +unsigned long __cpuinit calibrate_delay_is_known(void) +{ + int i, cpu = smp_processor_id(); + + if (tsc_disabled || !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + for_each_online_cpu(i) + if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) + return cpu_data(i).loops_per_jiffy; + return 0; +} +#endif diff --git a/init/calibrate.c b/init/calibrate.c index 24df7976816c..5f117ca9e069 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -246,6 +246,19 @@ recalibrate: static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 }; +/* + * Check if cpu calibration delay is already known. For example, + * some processors with multi-core sockets may have all cores + * with the same calibration delay. + * + * Architectures should override this function if a faster calibration + * method is available. + */ +unsigned long __attribute__((weak)) __cpuinit calibrate_delay_is_known(void) +{ + return 0; +} + void __cpuinit calibrate_delay(void) { unsigned long lpj; @@ -265,6 +278,8 @@ void __cpuinit calibrate_delay(void) lpj = lpj_fine; pr_info("Calibrating delay loop (skipped), " "value calculated using timer frequency.. "); + } else if ((lpj = calibrate_delay_is_known())) { + ; } else if ((lpj = calibrate_delay_direct()) != 0) { if (!printed) pr_info("Calibrating delay using timer " -- cgit v1.2.3 From f9b15df466ba923a5832c9121ad8327ccf5483ef Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Sat, 29 Oct 2011 00:48:42 +0200 Subject: x86/Kconfig: Cyclone-timer depends on x86-summit CONFIG_X86_CYCLONE_TIMER depends on CONFIG_X86_32_NON_STANDARD, which forces drivers/clocksource/cyclone.c to be compiled. The file doesn't do anything unless enabled by arch/x86/kernel/apic/summit_32.c Make CONFIG_X86_CYCLONE_TIMER depend by X86_SUMMIT instead, to avoid unnecessary code in other non-standard systems.
Signed-off-by: Alessandro Rubini Cc: john stultz Link: http://lkml.kernel.org/r/20111028224842.GA7582@mail.gnudd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9e7a361423d6..faf39a0d6242 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -633,7 +633,7 @@ config X86_SUMMIT_NUMA config X86_CYCLONE_TIMER def_bool y - depends on X86_32_NON_STANDARD + depends on X86_SUMMIT source "arch/x86/Kconfig.cpu" -- cgit v1.2.3 From 45db1c6176c8171d9ae6fa6d82e07d115a5950ca Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 5 Dec 2011 16:08:49 -0800 Subject: x86, um: Use the same style generated syscall tables as native Now when the native kernel uses a single style of generated system call table, follow suit for UML and implement the same style, all in C. This requires __NR_syscall_max and NR_syscalls to be generated; on native this is done in asm-headers.h but that file is common to all UML architectures; therefore put it in user-headers.h instead which already has accommodations for architecture-specific values. Signed-off-by: H.
Peter Anvin --- arch/x86/um/Makefile | 3 ++- arch/x86/um/sys_call_table_32.S | 26 ------------------- arch/x86/um/sys_call_table_32.c | 55 +++++++++++++++++++++++++++++++++++++++++ arch/x86/um/sys_call_table_64.c | 31 +++++++++-------------- arch/x86/um/user-offsets.c | 15 +++++++++++ 5 files changed, 84 insertions(+), 46 deletions(-) delete mode 100644 arch/x86/um/sys_call_table_32.S create mode 100644 arch/x86/um/sys_call_table_32.c (limited to 'arch/x86') diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 8fb58400e415..5d065b2222d3 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o extra-y += user-offsets.s -$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) +$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \ + -Iarch/x86/include/generated UNPROFILE_OBJS := stub_segv.o CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S deleted file mode 100644 index a7ca80d2dceb..000000000000 --- a/arch/x86/um/sys_call_table_32.S +++ /dev/null @@ -1,26 +0,0 @@ -#include -/* Steal i386 syscall table for our purposes, but with some slight changes.*/ - -#define sys_iopl sys_ni_syscall -#define sys_ioperm sys_ni_syscall - -#define sys_vm86old sys_ni_syscall -#define sys_vm86 sys_ni_syscall - -#define old_mmap sys_old_mmap - -#define ptregs_fork sys_fork -#define ptregs_execve sys_execve -#define ptregs_iopl sys_iopl -#define ptregs_vm86old sys_vm86old -#define ptregs_clone sys_clone -#define ptregs_vm86 sys_vm86 -#define ptregs_sigaltstack sys_sigaltstack -#define ptregs_vfork sys_vfork - -.section .rodata,"a" - -#include "../kernel/syscall_table_32.S" - -ENTRY(syscall_table_size) -.long .-sys_call_table diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c new file mode 100644 index 000000000000..b897fcae6205 --- 
/dev/null +++ b/arch/x86/um/sys_call_table_32.c @@ -0,0 +1,55 @@ +/* + * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c + * with some changes for UML. + */ + +#include +#include +#include +#include + +#define __NO_STUBS + +/* + * Below you can see, in terms of #define's, the differences between the x86-64 + * and the UML syscall table. + */ + +/* Not going to be implemented by UML, since we have no hardware. */ +#define stub_iopl sys_ni_syscall +#define sys_ioperm sys_ni_syscall + +#define sys_vm86old sys_ni_syscall +#define sys_vm86 sys_ni_syscall + +#define old_mmap sys_old_mmap + +#define ptregs_fork sys_fork +#define ptregs_execve sys_execve +#define ptregs_iopl sys_iopl +#define ptregs_vm86old sys_vm86old +#define ptregs_clone sys_clone +#define ptregs_vm86 sys_vm86 +#define ptregs_sigaltstack sys_sigaltstack +#define ptregs_vfork sys_vfork + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include + +#undef __SYSCALL_I386 +#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, + +typedef void (*sys_call_ptr_t)(void); + +extern void sys_ni_syscall(void); + +sys_call_ptr_t sys_call_table[] __cacheline_aligned = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; + +int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 99522f78b162..797a639bcca5 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -1,11 +1,12 @@ /* - * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c + * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c * with some changes for UML. 
*/ #include #include #include +#include #define __NO_STUBS @@ -34,31 +35,23 @@ #define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include "../../x86/include/asm/unistd_64.h" +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include -#undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, -#undef _ASM_X86_UNISTD_64_H +#undef __SYSCALL_64 +#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -/* - * We used to have a trick here which made sure that holes in the - * x86_64 table were filled in with sys_ni_syscall, but a comment in - * unistd_64.h says that holes aren't allowed, so the trick was - * removed. - * The trick looked like this - * [0 ... UM_NR_syscall_max] = &sys_ni_syscall - * before including unistd_64.h - the later initializations overwrote - * the sys_ni_syscall filler. - */ - sys_call_ptr_t sys_call_table[] __cacheline_aligned = { -#include + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... 
__NR_syscall_max] = &sys_ni_syscall, +#include }; int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ca49be8ddd0c..5edf4f4bbf53 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -8,6 +8,18 @@ #include #include +#ifdef __i386__ +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include +}; +#else +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include +}; +#endif + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -77,4 +89,7 @@ void foo(void) DEFINE(UM_PROT_READ, PROT_READ); DEFINE(UM_PROT_WRITE, PROT_WRITE); DEFINE(UM_PROT_EXEC, PROT_EXEC); + + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } -- cgit v1.2.3 From a074335a370eca6d72f2ec890e4ae22923a2aea4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 5 Dec 2011 22:48:49 -0800 Subject: x86, um: Mark system call tables readonly Mark the system call tables readonly, as they already are on native, and the 32-bit UM version was in the previous assembly version. The 32-bit version lost it due to copy and paste from the 64-bit version, which was missing the const. Cc: Jeff Dike Cc: Richard Weinberger Link: http://lkml.kernel.org/r/tip-45db1c6176c8171d9ae6fa6d82e07d115a5950ca@git.kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/um/sys_call_table_32.c | 2 +- arch/x86/um/sys_call_table_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index b897fcae6205..0606aa3e92ae 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -43,7 +43,7 @@ typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 797a639bcca5..fe626c3ba01b 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -45,7 +45,7 @@ typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. -- cgit v1.2.3 From 855c743a27bb58a9a521bdc485ef5acfdb69badc Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 6 Dec 2011 09:08:34 +0100 Subject: x86/mm: Initialize high mem before free_all_bootmem() Patch fixes a boot crash with pagealloc debugging enabled: Initializing HighMem for node 0 (000377fe:0003fff0) BUG: unable to handle kernel paging request at f6fefe80 IP: [] find_range_array+0x5e/0x69 [...] Call Trace: [] __get_free_all_memory_range+0x39/0xb4 [] add_highpages_with_active_regions+0x18/0x9b [] set_highmem_pages_init+0x70/0x90 [] mem_init+0x50/0x21b [] start_kernel+0x1bf/0x31c [] i386_start_kernel+0x65/0x67 The crash happens when memblock wants to allocate big area for temporary "struct range" array and reuses pages from top of low memory, which were already passed to the buddy allocator. 
Reported-by: Ingo Molnar Signed-off-by: Stanislaw Gruszka Cc: linux-mm@kvack.org Cc: Mel Gorman Link: http://lkml.kernel.org/r/20111206080833.GB3105@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3bebaed5021c..a2fecb1611cc 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -744,6 +744,17 @@ void __init mem_init(void) #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif + /* + * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to + * be done before free_all_bootmem(). Memblock use free low memory for + * temporary data (see find_range_array()) and for this purpose can use + * pages that was already passed to the buddy allocator, hence marked as + * not accessible in the page tables when compiled with + * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not + * important here. + */ + set_highmem_pages_init(); + /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); @@ -755,8 +766,6 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; - set_highmem_pages_init(); - codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; -- cgit v1.2.3 From 54c29c635ae91f5d75ced7bffeaa77ba37ca02bb Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 29 Nov 2011 17:05:11 +0100 Subject: mm, x86: Remove debug_pagealloc_enabled When (no)bootmem finish operation, it pass pages to buddy allocator. Since debug_pagealloc_enabled is not set, we will do not protect pages, what is not what we want with CONFIG_DEBUG_PAGEALLOC=y. To fix remove debug_pagealloc_enabled. 
That variable was introduced by commit 12d6f21e "x86: do not PSE on CONFIG_DEBUG_PAGEALLOC=y" to get more CPA (change page attribute) code testing. But currently we have CONFIG_CPA_DEBUG, which tests CPA. Signed-off-by: Stanislaw Gruszka Acked-by: Mel Gorman Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1322582711-14571-1-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 6 ------ include/linux/mm.h | 10 ---------- init/main.c | 5 ----- mm/debug-pagealloc.c | 3 --- 4 files changed, 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f9e526742fa1..5031eefa051f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1333,12 +1333,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable) numpages * PAGE_SIZE); } - /* - * If page allocator is not up yet then do not call c_p_a(): - */ - if (!debug_pagealloc_enabled) - return; - /* * The return value is ignored as the calls cannot fail.
* Large pages for identity mappings are not used at boot time diff --git a/include/linux/mm.h b/include/linux/mm.h index 3dc3a8c2c485..0a22db144753 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1537,23 +1537,13 @@ static inline void vm_stat_account(struct mm_struct *mm, #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_DEBUG_PAGEALLOC -extern int debug_pagealloc_enabled; - extern void kernel_map_pages(struct page *page, int numpages, int enable); - -static inline void enable_debug_pagealloc(void) -{ - debug_pagealloc_enabled = 1; -} #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} -static inline void enable_debug_pagealloc(void) -{ -} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ diff --git a/init/main.c b/init/main.c index 217ed23e9487..99c4ba30ba7e 100644 --- a/init/main.c +++ b/init/main.c @@ -282,10 +282,6 @@ static int __init unknown_bootoption(char *param, char *val) return 0; } -#ifdef CONFIG_DEBUG_PAGEALLOC -int __read_mostly debug_pagealloc_enabled = 0; -#endif - static int __init init_setup(char *str) { unsigned int i; @@ -597,7 +593,6 @@ asmlinkage void __init start_kernel(void) } #endif page_cgroup_init(); - enable_debug_pagealloc(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index 7cea557407f4..789ff70c8a4a 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c @@ -95,9 +95,6 @@ static void unpoison_pages(struct page *page, int n) void kernel_map_pages(struct page *page, int numpages, int enable) { - if (!debug_pagealloc_enabled) - return; - if (enable) unpoison_pages(page, numpages); else -- cgit v1.2.3 From 565cbc3e934f221369a656b4469a044aa4c3f2a8 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 6 Dec 2011 13:08:59 
-0500 Subject: x86, NMI: NMI-selftest should handle the UP case properly If no remote cpus are online, then just quietly skip the remote IPI test for now. Signed-off-by: Don Zickus Cc: andi@firstfloor.org Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: robert.richter@amd.com Link: http://lkml.kernel.org/r/20111206180859.GR1669@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 572adb622251..1e42a23c1f2a 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -90,7 +90,8 @@ static void remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); - test_nmi_ipi(to_cpumask(nmi_ipi_mask)); + if (!cpumask_empty(nmi_ipi_mask)) + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } static void local_ipi(void) -- cgit v1.2.3 From d2db6610219cbcadceea6c43ee03d89068b7d759 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Wed, 7 Dec 2011 17:29:10 +0900 Subject: x86: Add stack top margin for stack overflow checking It seems that a margin for stack overflow checking is added to top of a kernel stack but is not added to IRQ and exception stacks in stack_overflow_check(). Therefore, the overflows of IRQ and exception stacks are always detected only after they actually occurred and data corruption might occur due to them. This patch adds the margin to top of IRQ and exception stacks as well as a kernel stack to enhance reliability. 
Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20111207082910.9847.3359.stgit@ltc219.sdl.hitachi.co.jp [ removed the #undef - we typically don't do that for uncommon names ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 54e2b2b2e250..d04d3ecded62 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -38,6 +38,7 @@ int sysctl_panic_on_stackoverflow; static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW +#define STACK_TOP_MARGIN 128 struct orig_ist *oist; u64 irq_stack_top, irq_stack_bottom; u64 estack_top, estack_bottom; @@ -47,17 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128 && + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) + + STACK_TOP_MARGIN; irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) return; oist = &__get_cpu_var(orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ; + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; -- cgit v1.2.3 From 4f941c57fe7e04e38c2401d53516bfd16038c9ab Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 7 Dec 2011 16:06:30 -0500 Subject: x86, NMI: NMI selftest depends on the local apic The selftest doesn't work with out a local apic for now. 
Reported-by: Randy Durlap Signed-off-by: Don Zickus Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20111207210630.GI1669@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 97da3c17b424..aa4158f3ce62 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -289,7 +289,7 @@ config DEBUG_STRICT_USER_COPY_CHECKS config DEBUG_NMI_SELFTEST bool "NMI Selftest" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && X86_LOCAL_APIC ---help--- Enabling this option turns on a quick NMI selftest to verify that the NMI behaves correctly. -- cgit v1.2.3 From 3d6240b53e34e372be007e08f7066e7625910675 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 7 Dec 2011 14:06:12 +0300 Subject: x86, NMI: Add to_cpumask() to silence compile warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gcc complains if we don't cast this to a struct cpumask pointer. 
arch/x86/kernel/nmi_selftest.c:93:2: warning: passing argument 1 of ‘cpumask_empty’ from incompatible pointer type [enabled by default] Signed-off-by: Dan Carpenter Cc: Don Zickus Link: http://lkml.kernel.org/r/20111207110612.GA3437@mwanda Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 1e42a23c1f2a..0d01a8ea4e11 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -90,7 +90,7 @@ static void remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); - if (!cpumask_empty(nmi_ipi_mask)) + if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -- cgit v1.2.3 From 54eed6cb16ec315565aaaf8e34252ca253a68b7b Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Thu, 8 Dec 2011 13:16:41 +0100 Subject: x86/numa: Add constraints check for nid parameters This patch adds constraint checks to the numa_set_distance() function. When the check triggers (this should not happen normally) it emits a warning and avoids a store to a negative index in numa_distance[] array - i.e. avoids memory corruption. Negative ids can be passed when the pxm-to-nids mapping is not properly filled while parsing the SRAT. Signed-off-by: Petr Holasek Acked-by: David Rientjes Cc: Anton Arapov Link: http://lkml.kernel.org/r/20111208121640.GA2229@dhcp-27-244.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fbeaaf416610..cdc00543d375 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -430,8 +430,9 @@ static int __init numa_alloc_distance(void) * calls are ignored until the distance table is reset with * numa_reset_distance(). 
* - * If @from or @to is higher than the highest known node at the time of - * table creation or @distance doesn't make sense, the call is ignored. + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. * This is to allow simplification of specific NUMA config implementations. */ void __init numa_set_distance(int from, int to, int distance) @@ -439,8 +440,9 @@ void __init numa_set_distance(int from, int to, int distance) if (!numa_distance && numa_alloc_distance() < 0) return; - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", from, to, distance); return; } -- cgit v1.2.3 From 47db9e7c808a45b1f86971f25eca5e38fa95ab86 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 9 Dec 2011 11:13:59 -0800 Subject: x86, um: Fix typo in 32-bit system call modifications We override sys_iopl(), not stub_iopl(); the latter is a 64-bitism that doesn't apply to i386 in the first place. Reported-by: Richard Weinberger Signed-off-by: H. Peter Anvin --- arch/x86/um/sys_call_table_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 0606aa3e92ae..416bd40c0eba 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -16,7 +16,7 @@ */ /* Not going to be implemented by UML, since we have no hardware. 
*/ -#define stub_iopl sys_ni_syscall +#define sys_iopl sys_ni_syscall #define sys_ioperm sys_ni_syscall #define sys_vm86old sys_ni_syscall -- cgit v1.2.3 From 8af21e7e71d1ac56d9b66fb787a14fd66af7f5f7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Sat, 27 Aug 2011 09:35:45 +0100 Subject: x86: Add missing bzImage fields to struct setup_header commit 37ba7ab5e33c ("x86, boot: make kernel_alignment adjustable; new bzImage fields") introduced some new fields into the bzImage header but struct setup_header was not updated accordingly. Add the missing 'pref_address' and 'init_size' fields. Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/bootparam.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index e020d88ec02d..2f90c51cc49d 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -64,6 +64,8 @@ struct setup_header { __u32 payload_offset; __u32 payload_length; __u64 setup_data; + __u64 pref_address; + __u32 init_size; } __attribute__((packed)); struct sys_desc_table { -- cgit v1.2.3 From f7d7d01be53cb47e0ae212c4e968aa28b82d2138 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 15 Nov 2011 12:56:14 +0000 Subject: x86: Don't use magic strings for EFI loader signature Introduce a symbol, EFI_LOADER_SIGNATURE instead of using the magic strings, which also helps to reduce the amount of ifdeffery. Cc: Matthew Garrett Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/efi.h | 4 ++++ arch/x86/kernel/setup.c | 7 +------ 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index b8d8bfcd44a9..26d8c18d5faa 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,8 @@ #ifdef CONFIG_X86_32 +#define EFI_LOADER_SIGNATURE "EL32" + extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_phys0(f) efi_call_phys(f) @@ -35,6 +37,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ +#define EFI_LOADER_SIGNATURE "EL64" + extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9a9e40fb091c..4d5243c31ac4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -752,12 +752,7 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, -#ifdef CONFIG_X86_32 - "EL32", -#else - "EL64", -#endif - 4)) { + EFI_LOADER_SIGNATURE, 4)) { efi_enabled = 1; efi_memblock_x86_reserve_range(); } -- cgit v1.2.3 From 291f36325f9f252bd76ef5f603995f37e453fc60 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 12 Dec 2011 21:27:52 +0000 Subject: x86, efi: EFI boot stub support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is currently a large divide between kernel development and the development of EFI boot loaders. The idea behind this patch is to give the kernel developers full control over the EFI boot process. As H. Peter Anvin put it, "The 'kernel carries its own stub' approach been very successful in dealing with BIOS, and would make a lot of sense to me for EFI as well." This patch introduces an EFI boot stub that allows an x86 bzImage to be loaded and executed by EFI firmware. 
The bzImage appears to the firmware as an EFI application. Luckily there are enough free bits within the bzImage header so that it can masquerade as an EFI application, thereby coercing the EFI firmware into loading it and jumping to its entry point. The beauty of this masquerading approach is that both BIOS and EFI boot loaders can still load and run the same bzImage, thereby allowing a single kernel image to work in any boot environment. The EFI boot stub supports multiple initrds, but they must exist on the same partition as the bzImage. Command-line arguments for the kernel can be appended after the bzImage name when run from the EFI shell, e.g. Shell> bzImage console=ttyS0 root=/dev/sdb initrd=initrd.img v7: - Fix checkpatch warnings. v6: - Try to allocate initrd memory just below hdr->initrd_addr_max. v5: - load_options_size is UTF-16, which needs dividing by 2 to convert to the corresponding ASCII size. v4: - Don't read more than image->load_options_size v3: - Fix following warnings when compiling CONFIG_EFI_STUB=n arch/x86/boot/tools/build.c: In function ‘main’: arch/x86/boot/tools/build.c:138:24: warning: unused variable ‘pe_header’ arch/x86/boot/tools/build.c:138:15: warning: unused variable ‘file_sz’ - As reported by Matthew Garrett, some Apple machines have GOPs that don't have hardware attached. We need to weed these out by searching for ones that handle the PCIIO protocol. - Don't allocate memory if no initrds are on cmdline - Don't trust image->load_options_size Maarten Lankhorst noted: - Don't strip first argument when booted from efibootmgr - Don't allocate too much memory for cmdline - Don't update cmdline_size, the kernel considers it read-only - Don't accept '\n' for initrd names v2: - File alignment was too large, was 8192 should be 512. Reported by Maarten Lankhorst on LKML. - Added UGA support for graphics - Use VIDEO_TYPE_EFI instead of hard-coded number.
- Move linelength assignment until after we've assigned depth - Dynamically fill out AddressOfEntryPoint in tools/build.c - Don't use magic number for GDT/TSS stuff. Requested by Andi Kleen - The bzImage may need to be relocated as it may have been loaded at a high address by the firmware. This was required to get my macbook booting because the firmware loaded it at 0x7cxxxxxx, which triggers this error in decompress_kernel(), if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) error("Destination address too large"); Cc: Mike Waychison Cc: Matthew Garrett Tested-by: Henrik Rydberg Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1321383097.2657.9.camel@mfleming-mobl1.ger.corp.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 7 + arch/x86/boot/compressed/Makefile | 10 +- arch/x86/boot/compressed/eboot.c | 1014 ++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/eboot.h | 60 ++ arch/x86/boot/compressed/efi_stub_32.S | 86 +++ arch/x86/boot/compressed/efi_stub_64.S | 1 + arch/x86/boot/compressed/head_32.S | 22 + arch/x86/boot/compressed/head_64.S | 20 + arch/x86/boot/compressed/string.c | 9 + arch/x86/boot/header.S | 158 +++++ arch/x86/boot/string.c | 35 ++ arch/x86/boot/tools/build.c | 39 ++ arch/x86/kernel/asm-offsets.c | 2 + 13 files changed, 1462 insertions(+), 1 deletion(-) create mode 100644 arch/x86/boot/compressed/eboot.c create mode 100644 arch/x86/boot/compressed/eboot.h create mode 100644 arch/x86/boot/compressed/efi_stub_32.S create mode 100644 arch/x86/boot/compressed/efi_stub_64.S (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index efb42949cc09..d71b656bcb97 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1478,6 +1478,13 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms.
+config EFI_STUB + bool "EFI stub support" + depends on EFI + ---help--- + This kernel feature allows a bzImage to be loaded directly + by EFI firmware without the use of a bootloader. + config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 09664efb9cee..b123b9a8f5b3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE +VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ + $(obj)/piggy.o + +ifeq ($(CONFIG_EFI_STUB), y) + VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o +endif + +$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c new file mode 100644 index 000000000000..4055e63d0b04 --- /dev/null +++ b/arch/x86/boot/compressed/eboot.c @@ -0,0 +1,1014 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. 
+ * + * ----------------------------------------------------------------------- */ + +#include +#include +#include +#include + +#include "eboot.h" + +static efi_system_table_t *sys_table; + +static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, + unsigned long *desc_size) +{ + efi_memory_desc_t *m = NULL; + efi_status_t status; + unsigned long key; + u32 desc_version; + + *map_size = sizeof(*m) * 32; +again: + /* + * Add an additional efi_memory_desc_t because we're doing an + * allocation which may be in a new descriptor region. + */ + *map_size += sizeof(*m); + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, *map_size, (void **)&m); + if (status != EFI_SUCCESS) + goto fail; + + status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size, + m, &key, desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL) { + efi_call_phys1(sys_table->boottime->free_pool, m); + goto again; + } + + if (status != EFI_SUCCESS) + efi_call_phys1(sys_table->boottime->free_pool, m); + +fail: + *map = m; + return status; +} + +/* + * Allocate at the highest possible address that is not above 'max'. 
+ */ +static efi_status_t high_alloc(unsigned long size, unsigned long align, + unsigned long *addr, unsigned long max) +{ + unsigned long map_size, desc_size; + efi_memory_desc_t *map; + efi_status_t status; + unsigned long nr_pages; + u64 max_addr = 0; + int i; + + status = __get_map(&map, &map_size, &desc_size); + if (status != EFI_SUCCESS) + goto fail; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; +again: + for (i = 0; i < map_size / desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map; + u64 start, end; + + desc = (efi_memory_desc_t *)(m + (i * desc_size)); + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); + + if ((start + size) > end || (start + size) > max) + continue; + + if (end - size > max) + end = max; + + if (round_down(end - size, align) < start) + continue; + + start = round_down(end - size, align); + + /* + * Don't allocate at 0x0. It will confuse code that + * checks pointers against NULL. + */ + if (start == 0x0) + continue; + + if (start > max_addr) + max_addr = start; + } + + if (!max_addr) + status = EFI_NOT_FOUND; + else { + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &max_addr); + if (status != EFI_SUCCESS) { + max = max_addr; + max_addr = 0; + goto again; + } + + *addr = max_addr; + } + +free_pool: + efi_call_phys1(sys_table->boottime->free_pool, map); + +fail: + return status; +} + +/* + * Allocate at the lowest possible address. 
+ */ +static efi_status_t low_alloc(unsigned long size, unsigned long align, + unsigned long *addr) +{ + unsigned long map_size, desc_size; + efi_memory_desc_t *map; + efi_status_t status; + unsigned long nr_pages; + int i; + + status = __get_map(&map, &map_size, &desc_size); + if (status != EFI_SUCCESS) + goto fail; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + for (i = 0; i < map_size / desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map; + u64 start, end; + + desc = (efi_memory_desc_t *)(m + (i * desc_size)); + + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); + + /* + * Don't allocate at 0x0. It will confuse code that + * checks pointers against NULL. Skip the first 8 + * bytes so we start at a nice even number. + */ + if (start == 0x0) + start += 8; + + start = round_up(start, align); + if ((start + size) > end) + continue; + + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &start); + if (status == EFI_SUCCESS) { + *addr = start; + break; + } + } + + if (i == map_size / desc_size) + status = EFI_NOT_FOUND; + +free_pool: + efi_call_phys1(sys_table->boottime->free_pool, map); +fail: + return status; +} + +static void low_free(unsigned long size, unsigned long addr) +{ + unsigned long nr_pages; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + efi_call_phys2(sys_table->boottime->free_pages, addr, size); +} + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +/* + * See if we have Graphics Output Protocol + */ +static efi_status_t setup_gop(struct screen_info *si, 
efi_guid_t *proto, + unsigned long size) +{ + struct efi_graphics_output_protocol *gop, *first_gop; + struct efi_pixel_bitmask pixel_info; + unsigned long nr_gops; + efi_status_t status; + void **gop_handle; + u16 width, height; + u32 fb_base, fb_size; + u32 pixels_per_scan_line; + int pixel_format; + int i; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, proto, + NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + first_gop = NULL; + + nr_gops = size / sizeof(void *); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info; + efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; + void *pciio; + void *h = gop_handle[i]; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + h, proto, &gop); + if (status != EFI_SUCCESS) + continue; + + efi_call_phys3(sys_table->boottime->handle_protocol, + h, &pciio_proto, &pciio); + + status = efi_call_phys4(gop->query_mode, gop, + gop->mode->mode, &size, &info); + if (status == EFI_SUCCESS && (!first_gop || pciio)) { + /* + * Apple provide GOPs that are not backed by + * real hardware (they're used to handle + * multiple displays). The workaround is to + * search for a GOP implementing the PCIIO + * protocol, and if one isn't found, to just + * fallback to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + fb_base = gop->mode->frame_buffer_base; + fb_size = gop->mode->frame_buffer_size; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + + /* + * Once we've found a GOP supporting PCIIO, + * don't bother looking any further. + */ + if (pciio) + break; + + first_gop = gop; + } + } + + /* Did we find any GOPs? 
*/ + if (!first_gop) + goto free_handle; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + si->lfb_size = fb_size; + si->pages = 1; + + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, gop_handle); + return status; +} + +/* + * See if we have Universal Graphics Adapter (UGA) protocol + */ +static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, + unsigned long size) +{ + struct efi_uga_draw_protocol *uga, *first_uga; + unsigned long nr_ugas; + efi_status_t status; + u32 width, height; + void **uga_handle = NULL; + int i; + + status = 
efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &uga_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, uga_proto, + NULL, &size, uga_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + first_uga = NULL; + + nr_ugas = size / sizeof(void *); + for (i = 0; i < nr_ugas; i++) { + efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; + void *handle = uga_handle[i]; + u32 w, h, depth, refresh; + void *pciio; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + handle, uga_proto, &uga); + if (status != EFI_SUCCESS) + continue; + + efi_call_phys3(sys_table->boottime->handle_protocol, + handle, &pciio_proto, &pciio); + + status = efi_call_phys5(uga->get_mode, uga, &w, &h, + &depth, &refresh); + if (status == EFI_SUCCESS && (!first_uga || pciio)) { + width = w; + height = h; + + /* + * Once we've found a UGA supporting PCIIO, + * don't bother looking any further. 
+ */ + if (pciio) + break; + + first_uga = uga; + } + } + + if (!first_uga) + goto free_handle; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_depth = 32; + si->lfb_width = width; + si->lfb_height = height; + + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, uga_handle); + return status; +} + +void setup_graphics(struct boot_params *boot_params) +{ + efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; + struct screen_info *si; + efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; + efi_status_t status; + unsigned long size; + void **gop_handle = NULL; + void **uga_handle = NULL; + + si = &boot_params->screen_info; + memset(si, 0, sizeof(*si)); + + size = 0; + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &graphics_proto, + NULL, &size, gop_handle); + if (status == EFI_BUFFER_TOO_SMALL) + status = setup_gop(si, &graphics_proto, size); + + if (status != EFI_SUCCESS) { + size = 0; + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &uga_proto, + NULL, &size, uga_handle); + if (status == EFI_BUFFER_TOO_SMALL) + setup_uga(si, &uga_proto, size); + } +} + +struct initrd { + efi_file_handle_t *handle; + u64 size; +}; + +/* + * Check the cmdline for a LILO-style initrd= arguments. + * + * We only support loading an initrd from the same filesystem as the + * kernel image. 
+ */ +static efi_status_t handle_ramdisks(efi_loaded_image_t *image, + struct setup_header *hdr) +{ + struct initrd *initrds; + unsigned long initrd_addr; + efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; + u64 initrd_total; + efi_file_io_interface_t *io; + efi_file_handle_t *fh; + efi_status_t status; + int nr_initrds; + char *str; + int i, j, k; + + initrd_addr = 0; + initrd_total = 0; + + str = (char *)(unsigned long)hdr->cmd_line_ptr; + + j = 0; /* See close_handles */ + + if (!str || !*str) + return EFI_SUCCESS; + + for (nr_initrds = 0; *str; nr_initrds++) { + str = strstr(str, "initrd="); + if (!str) + break; + + str += 7; + + /* Skip any leading slashes */ + while (*str == '/' || *str == '\\') + str++; + + while (*str && *str != ' ' && *str != '\n') + str++; + } + + if (!nr_initrds) + return EFI_SUCCESS; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, + nr_initrds * sizeof(*initrds), + &initrds); + if (status != EFI_SUCCESS) + goto fail; + + str = (char *)(unsigned long)hdr->cmd_line_ptr; + for (i = 0; i < nr_initrds; i++) { + struct initrd *initrd; + efi_file_handle_t *h; + efi_file_info_t *info; + efi_char16_t filename[256]; + unsigned long info_sz; + efi_guid_t info_guid = EFI_FILE_INFO_ID; + efi_char16_t *p; + u64 file_sz; + + str = strstr(str, "initrd="); + if (!str) + break; + + str += 7; + + initrd = &initrds[i]; + p = filename; + + /* Skip any leading slashes */ + while (*str == '/' || *str == '\\') + str++; + + while (*str && *str != ' ' && *str != '\n') { + if (p >= filename + sizeof(filename)) + break; + + *p++ = *str++; + } + + *p = '\0'; + + /* Only open the volume once. 
*/ + if (!i) { + efi_boot_services_t *boottime; + + boottime = sys_table->boottime; + + status = efi_call_phys3(boottime->handle_protocol, + image->device_handle, &fs_proto, &io); + if (status != EFI_SUCCESS) + goto free_initrds; + + status = efi_call_phys2(io->open_volume, io, &fh); + if (status != EFI_SUCCESS) + goto free_initrds; + } + + status = efi_call_phys5(fh->open, fh, &h, filename, + EFI_FILE_MODE_READ, (u64)0); + if (status != EFI_SUCCESS) + goto close_handles; + + initrd->handle = h; + + info_sz = 0; + status = efi_call_phys4(h->get_info, h, &info_guid, + &info_sz, NULL); + if (status != EFI_BUFFER_TOO_SMALL) + goto close_handles; + +grow: + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, info_sz, &info); + if (status != EFI_SUCCESS) + goto close_handles; + + status = efi_call_phys4(h->get_info, h, &info_guid, + &info_sz, info); + if (status == EFI_BUFFER_TOO_SMALL) { + efi_call_phys1(sys_table->boottime->free_pool, info); + goto grow; + } + + file_sz = info->file_size; + efi_call_phys1(sys_table->boottime->free_pool, info); + + if (status != EFI_SUCCESS) + goto close_handles; + + initrd->size = file_sz; + initrd_total += file_sz; + } + + if (initrd_total) { + unsigned long addr; + + /* + * Multiple initrd's need to be at consecutive + * addresses in memory, so allocate enough memory for + * all the initrd's. + */ + status = high_alloc(initrd_total, 0x1000, + &initrd_addr, hdr->initrd_addr_max); + if (status != EFI_SUCCESS) + goto close_handles; + + /* We've run out of free low memory. 
*/ + if (initrd_addr > hdr->initrd_addr_max) { + status = EFI_INVALID_PARAMETER; + goto free_initrd_total; + } + + addr = initrd_addr; + for (j = 0; j < nr_initrds; j++) { + u64 size; + + size = initrds[j].size; + status = efi_call_phys3(fh->read, initrds[j].handle, + &size, addr); + if (status != EFI_SUCCESS) + goto free_initrd_total; + + efi_call_phys1(fh->close, initrds[j].handle); + + addr += size; + } + + } + + efi_call_phys1(sys_table->boottime->free_pool, initrds); + + hdr->ramdisk_image = initrd_addr; + hdr->ramdisk_size = initrd_total; + + return status; + +free_initrd_total: + low_free(initrd_total, initrd_addr); + +close_handles: + for (k = j; k < nr_initrds; k++) + efi_call_phys1(fh->close, initrds[k].handle); +free_initrds: + efi_call_phys1(sys_table->boottime->free_pool, initrds); +fail: + hdr->ramdisk_image = 0; + hdr->ramdisk_size = 0; + + return status; +} + +/* + * Because the x86 boot code expects to be passed a boot_params we + * need to create one ourselves (usually the bootloader would create + * one for us). 
+ */ +static efi_status_t make_boot_params(struct boot_params *boot_params, + efi_loaded_image_t *image, + void *handle) +{ + struct efi_info *efi = &boot_params->efi_info; + struct apm_bios_info *bi = &boot_params->apm_bios_info; + struct sys_desc_table *sdt = &boot_params->sys_desc_table; + struct e820entry *e820_map = &boot_params->e820_map[0]; + struct e820entry *prev = NULL; + struct setup_header *hdr = &boot_params->hdr; + unsigned long size, key, desc_size, _size; + efi_memory_desc_t *mem_map; + void *options = image->load_options; + u32 load_options_size = image->load_options_size / 2; /* ASCII */ + int options_size = 0; + efi_status_t status; + __u32 desc_version; + unsigned long cmdline; + u8 nr_entries; + u16 *s2; + u8 *s1; + int i; + + hdr->type_of_loader = 0x21; + + /* Convert unicode cmdline to ascii */ + cmdline = 0; + s2 = (u16 *)options; + + if (s2) { + while (*s2 && *s2 != '\n' && options_size < load_options_size) { + s2++; + options_size++; + } + + if (options_size) { + if (options_size > hdr->cmdline_size) + options_size = hdr->cmdline_size; + + options_size++; /* NUL termination */ + + status = low_alloc(options_size, 1, &cmdline); + if (status != EFI_SUCCESS) + goto fail; + + s1 = (u8 *)(unsigned long)cmdline; + s2 = (u16 *)options; + + for (i = 0; i < options_size - 1; i++) + *s1++ = *s2++; + + *s1 = '\0'; + } + } + + hdr->cmd_line_ptr = cmdline; + + hdr->ramdisk_image = 0; + hdr->ramdisk_size = 0; + + status = handle_ramdisks(image, hdr); + if (status != EFI_SUCCESS) + goto free_cmdline; + + setup_graphics(boot_params); + + /* Clear APM BIOS info */ + memset(bi, 0, sizeof(*bi)); + + memset(sdt, 0, sizeof(*sdt)); + + memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); + + size = sizeof(*mem_map) * 32; + +again: + size += sizeof(*mem_map); + _size = size; + status = low_alloc(size, 1, (unsigned long *)&mem_map); + if (status != EFI_SUCCESS) + goto free_cmdline; + + status = 
efi_call_phys5(sys_table->boottime->get_memory_map, &size, + mem_map, &key, &desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL) { + low_free(_size, (unsigned long)mem_map); + goto again; + } + + if (status != EFI_SUCCESS) + goto free_mem_map; + + efi->efi_systab = (unsigned long)sys_table; + efi->efi_memdesc_size = desc_size; + efi->efi_memdesc_version = desc_version; + efi->efi_memmap = (unsigned long)mem_map; + efi->efi_memmap_size = size; + +#ifdef CONFIG_X86_64 + efi->efi_systab_hi = (unsigned long)sys_table >> 32; + efi->efi_memmap_hi = (unsigned long)mem_map >> 32; +#endif + + /* Might as well exit boot services now */ + status = efi_call_phys2(sys_table->boottime->exit_boot_services, + handle, key); + if (status != EFI_SUCCESS) + goto free_mem_map; + + /* Historic? */ + boot_params->alt_mem_k = 32 * 1024; + + /* + * Convert the EFI memory map to E820. + */ + nr_entries = 0; + for (i = 0; i < size / desc_size; i++) { + efi_memory_desc_t *d; + unsigned int e820_type = 0; + unsigned long m = (unsigned long)mem_map; + + d = (efi_memory_desc_t *)(m + (i * desc_size)); + switch (d->type) { + case EFI_RESERVED_TYPE: + case EFI_RUNTIME_SERVICES_CODE: + case EFI_RUNTIME_SERVICES_DATA: + case EFI_MEMORY_MAPPED_IO: + case EFI_MEMORY_MAPPED_IO_PORT_SPACE: + case EFI_PAL_CODE: + e820_type = E820_RESERVED; + break; + + case EFI_UNUSABLE_MEMORY: + e820_type = E820_UNUSABLE; + break; + + case EFI_ACPI_RECLAIM_MEMORY: + e820_type = E820_ACPI; + break; + + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + e820_type = E820_RAM; + break; + + case EFI_ACPI_MEMORY_NVS: + e820_type = E820_NVS; + break; + + default: + continue; + } + + /* Merge adjacent mappings */ + if (prev && prev->type == e820_type && + (prev->addr + prev->size) == d->phys_addr) + prev->size += d->num_pages << 12; + else { + e820_map->addr = d->phys_addr; + e820_map->size = d->num_pages << 12; + 
e820_map->type = e820_type; + prev = e820_map++; + nr_entries++; + } + } + + boot_params->e820_entries = nr_entries; + + return EFI_SUCCESS; + +free_mem_map: + low_free(_size, (unsigned long)mem_map); +free_cmdline: + if (options_size) + low_free(options_size, hdr->cmd_line_ptr); +fail: + return status; +} + +/* + * On success we return a pointer to a boot_params structure, and NULL + * on failure. + */ +struct boot_params *efi_main(void *handle, efi_system_table_t *_table) +{ + struct boot_params *boot_params; + unsigned long start, nr_pages; + struct desc_ptr *gdt, *idt; + efi_loaded_image_t *image; + struct setup_header *hdr; + efi_status_t status; + efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; + struct desc_struct *desc; + + sys_table = _table; + + /* Check if we were booted by the EFI firmware */ + if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + goto fail; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + handle, &proto, (void *)&image); + if (status != EFI_SUCCESS) + goto fail; + + status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); + if (status != EFI_SUCCESS) + goto fail; + + memset(boot_params, 0x0, 0x4000); + + /* Copy first two sectors to boot_params */ + memcpy(boot_params, image->image_base, 1024); + + hdr = &boot_params->hdr; + + /* + * The EFI firmware loader could have placed the kernel image + * anywhere in memory, but the kernel has various restrictions + * on the max physical address it can run at. Attempt to move + * the kernel to boot_params.pref_address, or as low as + * possible. 
+ */ + start = hdr->pref_address; + nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &start); + if (status != EFI_SUCCESS) { + status = low_alloc(hdr->init_size, hdr->kernel_alignment, + &start); + if (status != EFI_SUCCESS) + goto fail; + } + + hdr->code32_start = (__u32)start; + hdr->pref_address = (__u64)(unsigned long)image->image_base; + + memcpy((void *)start, image->image_base, image->image_size); + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, sizeof(*gdt), + (void **)&gdt); + if (status != EFI_SUCCESS) + goto fail; + + gdt->size = 0x800; + status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); + if (status != EFI_SUCCESS) + goto fail; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, sizeof(*idt), + (void **)&idt); + if (status != EFI_SUCCESS) + goto fail; + + idt->size = 0; + idt->address = 0; + + status = make_boot_params(boot_params, image, handle); + if (status != EFI_SUCCESS) + goto fail; + + memset((char *)gdt->address, 0x0, gdt->size); + desc = (struct desc_struct *)gdt->address; + + /* The first GDT is a dummy and the second is unused. 
*/ + desc += 2; + + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + + desc++; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + +#ifdef CONFIG_X86_64 + /* Task segment value */ + desc++; + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; +#endif /* CONFIG_X86_64 */ + + asm volatile ("lidt %0" : : "m" (*idt)); + asm volatile ("lgdt %0" : : "m" (*gdt)); + + asm volatile("cli"); + + return boot_params; +fail: + return NULL; +} diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h new file mode 100644 index 000000000000..f66d023e91ef --- /dev/null +++ b/arch/x86/boot/compressed/eboot.h @@ -0,0 +1,60 @@ +#ifndef BOOT_COMPRESSED_EBOOT_H +#define BOOT_COMPRESSED_EBOOT_H + +#define SEG_TYPE_DATA (0 << 3) +#define SEG_TYPE_READ_WRITE (1 << 1) +#define SEG_TYPE_CODE (1 << 3) +#define SEG_TYPE_EXEC_READ (1 << 1) +#define SEG_TYPE_TSS ((1 << 3) | (1 << 0)) +#define SEG_OP_SIZE_32BIT (1 << 0) +#define SEG_GRANULARITY_4KB (1 << 0) + +#define DESC_TYPE_CODE_DATA (1 << 0) + +#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) + +#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 +#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 +#define PIXEL_BIT_MASK 2 +#define PIXEL_BLT_ONLY 3 +#define PIXEL_FORMAT_MAX 
4 + +struct efi_pixel_bitmask { + u32 red_mask; + u32 green_mask; + u32 blue_mask; + u32 reserved_mask; +}; + +struct efi_graphics_output_mode_info { + u32 version; + u32 horizontal_resolution; + u32 vertical_resolution; + int pixel_format; + struct efi_pixel_bitmask pixel_information; + u32 pixels_per_scan_line; +} __packed; + +struct efi_graphics_output_protocol_mode { + u32 max_mode; + u32 mode; + unsigned long info; + unsigned long size_of_info; + u64 frame_buffer_base; + unsigned long frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol { + void *query_mode; + unsigned long set_mode; + unsigned long blt; + struct efi_graphics_output_protocol_mode *mode; +}; + +struct efi_uga_draw_protocol { + void *get_mode; + void *set_mode; + void *blt; +}; + +#endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S new file mode 100644 index 000000000000..a53440e81d52 --- /dev/null +++ b/arch/x86/boot/compressed/efi_stub_32.S @@ -0,0 +1,86 @@ +/* + * EFI call stub for IA32. + * + * This stub allows us to make EFI calls in physical mode with interrupts + * turned off. Note that this implementation is different from the one in + * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical + * mode at this point. + */ + +#include +#include + +/* + * efi_call_phys(void *, ...) is a function with variable parameters. + * All the callers of this function assure that all the parameters are 4-bytes. + */ + +/* + * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. + * So we'd better save all of them at the beginning of this function and restore + * at the end no matter how many we use, because we can not assure EFI runtime + * service functions will comply with gcc calling convention, too. + */ + +.text +ENTRY(efi_call_phys) + /* + * 0. The function can only be called in Linux kernel. So CS has been + * set to 0x0010, DS and SS have been set to 0x0018. 
In EFI, I found + * the values of these registers are the same. And, the corresponding + * GDT entries are identical. So I will do nothing about segment reg + * and GDT, but change GDT base register in prolog and epilog. + */ + + /* + * 1. Because we haven't been relocated by this point we need to + * use relative addressing. + */ + call 1f +1: popl %edx + subl $1b, %edx + + /* + * 2. Now on the top of stack is the return + * address in the caller of efi_call_phys(), then parameter 1, + * parameter 2, ..., param n. To make things easy, we save the return + * address of efi_call_phys in a global variable. + */ + popl %ecx + movl %ecx, saved_return_addr(%edx) + /* get the function pointer into ECX*/ + popl %ecx + movl %ecx, efi_rt_function_ptr(%edx) + + /* + * 3. Call the physical function. + */ + call *%ecx + + /* + * 4. Balance the stack. And because EAX contains the return value, + * we'd better not clobber it. We need to calculate our address + * again because %ecx and %edx are not preserved across EFI function + * calls. + */ + call 1f +1: popl %edx + subl $1b, %edx + + movl efi_rt_function_ptr(%edx), %ecx + pushl %ecx + + /* + * 10. Push the saved return address onto the stack and return. 
+ */ + movl saved_return_addr(%edx), %ecx + pushl %ecx + ret +ENDPROC(efi_call_phys) +.previous + +.data +saved_return_addr: + .long 0 +efi_rt_function_ptr: + .long 0 diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S new file mode 100644 index 000000000000..cedc60de86eb --- /dev/null +++ b/arch/x86/boot/compressed/efi_stub_64.S @@ -0,0 +1 @@ +#include "../../platform/efi/efi_stub_64.S" diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 67a655a39ce4..a0559930a180 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -32,6 +32,28 @@ __HEAD ENTRY(startup_32) +#ifdef CONFIG_EFI_STUB + /* + * We don't need the return address, so set up the stack so + * efi_main() can find its arguments. + */ + add $0x4, %esp + + call efi_main + cmpl $0, %eax + je preferred_addr + movl %eax, %esi + call 1f +1: + popl %eax + subl $1b, %eax + subl BP_pref_address(%esi), %eax + add BP_code32_start(%esi), %eax + leal preferred_addr(%eax), %eax + jmp *%eax + +preferred_addr: +#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 35af09d13dc1..558d76ce23bc 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -199,6 +199,26 @@ ENTRY(startup_64) * an identity mapped page table being provied that maps our * entire text+data+bss and hopefully all of memory. */ +#ifdef CONFIG_EFI_STUB + pushq %rsi + mov %rcx, %rdi + mov %rdx, %rsi + call efi_main + popq %rsi + cmpq $0,%rax + je preferred_addr + movq %rax,%rsi + call 1f +1: + popq %rax + subq $1b, %rax + subq BP_pref_address(%rsi), %rax + add BP_code32_start(%esi), %eax + leaq preferred_addr(%rax), %rax + jmp *%rax + +preferred_addr: +#endif /* Setup data segments. 
*/ xorl %eax, %eax diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 19b3e693cd72..ffb9c5c9d748 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,2 +1,11 @@ #include "misc.h" + +int memcmp(const void *s1, const void *s2, size_t len) +{ + u8 diff; + asm("repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; +} + #include "../string.c" diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index bdb4d458ec8c..f1bbeeb09148 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -45,6 +45,11 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ .global bootsect_start bootsect_start: +#ifdef CONFIG_EFI_STUB + # "MZ", MS-DOS header + .byte 0x4d + .byte 0x5a +#endif # Normalize the start address ljmp $BOOTSEG, $start2 @@ -79,6 +84,14 @@ bs_die: # invoke the BIOS reset code... ljmp $0xf000,$0xfff0 +#ifdef CONFIG_EFI_STUB + .org 0x3c + # + # Offset to the PE header. + # + .long pe_header +#endif /* CONFIG_EFI_STUB */ + .section ".bsdata", "a" bugger_off_msg: .ascii "Direct booting from floppy is no longer supported.\r\n" @@ -87,6 +100,141 @@ bugger_off_msg: .ascii "Remove disk and press any key to reboot . . .\r\n" .byte 0 +#ifdef CONFIG_EFI_STUB +pe_header: + .ascii "PE" + .word 0 + +coff_header: +#ifdef CONFIG_X86_32 + .word 0x14c # i386 +#else + .word 0x8664 # x86-64 +#endif + .word 2 # nr_sections + .long 0 # TimeDateStamp + .long 0 # PointerToSymbolTable + .long 1 # NumberOfSymbols + .word section_table - optional_header # SizeOfOptionalHeader +#ifdef CONFIG_X86_32 + .word 0x306 # Characteristics. 
+ # IMAGE_FILE_32BIT_MACHINE | + # IMAGE_FILE_DEBUG_STRIPPED | + # IMAGE_FILE_EXECUTABLE_IMAGE | + # IMAGE_FILE_LINE_NUMS_STRIPPED +#else + .word 0x206 # Characteristics + # IMAGE_FILE_DEBUG_STRIPPED | + # IMAGE_FILE_EXECUTABLE_IMAGE | + # IMAGE_FILE_LINE_NUMS_STRIPPED +#endif + +optional_header: +#ifdef CONFIG_X86_32 + .word 0x10b # PE32 format +#else + .word 0x20b # PE32+ format +#endif + .byte 0x02 # MajorLinkerVersion + .byte 0x14 # MinorLinkerVersion + + # Filled in by build.c + .long 0 # SizeOfCode + + .long 0 # SizeOfInitializedData + .long 0 # SizeOfUninitializedData + + # Filled in by build.c + .long 0x0000 # AddressOfEntryPoint + + .long 0x0000 # BaseOfCode +#ifdef CONFIG_X86_32 + .long 0 # data +#endif + +extra_header_fields: +#ifdef CONFIG_X86_32 + .long 0 # ImageBase +#else + .quad 0 # ImageBase +#endif + .long 0x1000 # SectionAlignment + .long 0x200 # FileAlignment + .word 0 # MajorOperatingSystemVersion + .word 0 # MinorOperatingSystemVersion + .word 0 # MajorImageVersion + .word 0 # MinorImageVersion + .word 0 # MajorSubsystemVersion + .word 0 # MinorSubsystemVersion + .long 0 # Win32VersionValue + + # + # The size of the bzImage is written in tools/build.c + # + .long 0 # SizeOfImage + + .long 0x200 # SizeOfHeaders + .long 0 # CheckSum + .word 0xa # Subsystem (EFI application) + .word 0 # DllCharacteristics +#ifdef CONFIG_X86_32 + .long 0 # SizeOfStackReserve + .long 0 # SizeOfStackCommit + .long 0 # SizeOfHeapReserve + .long 0 # SizeOfHeapCommit +#else + .quad 0 # SizeOfStackReserve + .quad 0 # SizeOfStackCommit + .quad 0 # SizeOfHeapReserve + .quad 0 # SizeOfHeapCommit +#endif + .long 0 # LoaderFlags + .long 0x1 # NumberOfRvaAndSizes + + .quad 0 # ExportTable + .quad 0 # ImportTable + .quad 0 # ResourceTable + .quad 0 # ExceptionTable + .quad 0 # CertificationTable + .quad 0 # BaseRelocationTable + + # Section table +section_table: + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 # startup_{32,64} + .long 0 # Size of 
initialized data + # on disk + .long 0x0 # startup_{32,64} + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x60500020 # Characteristics (section flags) + + # + # The EFI application loader requires a relocation section + # because EFI applications are relocatable and not having + # this section seems to confuse it. But since we don't need + # the loader to fixup any relocs for us just fill it with a + # single dummy reloc. + # + .ascii ".reloc" + .byte 0 + .byte 0 + .long reloc_end - reloc_start + .long reloc_start + .long reloc_end - reloc_start # SizeOfRawData + .long reloc_start # PointerToRawData + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x42100040 # Characteristics (section flags) +#endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the # header, from the old boot sector. 
@@ -318,3 +466,13 @@ die: setup_corrupt: .byte 7 .string "No setup signature found...\n" + + .data +dummy: .long 0 + + .section .reloc +reloc_start: + .long dummy - reloc_start + .long 10 + .word 0 +reloc_end: diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 3cbc4058dd26..574dedfe2890 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas return result; } + +/** + * strlen - Find the length of a string + * @s: The string to be sized + */ +size_t strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + /* nothing */; + return sc - s; +} + +/** + * strstr - Find the first substring in a %NUL terminated string + * @s1: The string to be searched + * @s2: The string to search for + */ +char *strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + l1 = strlen(s1); + while (l1 >= l2) { + l1--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index fdc60a0b3c20..4e9bd6bcafa6 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -135,6 +135,9 @@ static void usage(void) int main(int argc, char ** argv) { +#ifdef CONFIG_EFI_STUB + unsigned int file_sz, pe_header; +#endif unsigned int i, sz, setup_sectors; int c; u32 sys_size; @@ -194,6 +197,42 @@ int main(int argc, char ** argv) buf[0x1f6] = sys_size >> 16; buf[0x1f7] = sys_size >> 24; +#ifdef CONFIG_EFI_STUB + file_sz = sz + i + ((sys_size * 16) - sz); + + pe_header = *(unsigned int *)&buf[0x3c]; + + /* Size of code */ + *(unsigned int *)&buf[pe_header + 0x1c] = file_sz; + + /* Size of image */ + *(unsigned int *)&buf[pe_header + 0x50] = file_sz; + +#ifdef CONFIG_X86_32 + /* Address of entry point */ + *(unsigned int *)&buf[pe_header + 0x28] = i; + + /* .text size */ + *(unsigned int 
*)&buf[pe_header + 0xb0] = file_sz; + + /* .text size of initialised data */ + *(unsigned int *)&buf[pe_header + 0xb8] = file_sz; +#else + /* + * Address of entry point. startup_32 is at the beginning and + * the 64-bit entry point (startup_64) is always 512 bytes + * after. + */ + *(unsigned int *)&buf[pe_header + 0x28] = i + 512; + + /* .text size */ + *(unsigned int *)&buf[pe_header + 0xc0] = file_sz; + + /* .text size of initialised data */ + *(unsigned int *)&buf[pe_header + 0xc8] = file_sz; +#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_EFI_STUB */ + crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, stdout) != i) die("Writing setup failed"); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 4f13fafc5264..68de2dc962ec 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -67,4 +67,6 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_pref_address, boot_params, hdr.pref_address); + OFFSET(BP_code32_start, boot_params, hdr.code32_start); } -- cgit v1.2.3 From 3e8f9451d3db669d7c0d8b330d4f5770149d90d5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 15 Dec 2011 22:19:41 +0000 Subject: x86: Fix INTEL_MID silly Doh.. pass the brown paper bags - preferably filled with mince pies.. This fixes occasional build failures. 
Signed-off-by: Alan Cox Link: http://lkml.kernel.org/n/tip-r0oc1knlvzuqr69artaeq8s8@git.kernel.org [ extended the changelog a bit ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faf39a0d6242..2b54a2fb3ab0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -429,6 +429,7 @@ config X86_MDFLD select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES + select X86_INTEL_MID ---help--- Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. -- cgit v1.2.3 From 2d2da60fb40a80cc59383121ccf763e0e0e8a42a Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 16 Dec 2011 13:30:58 +0100 Subject: x86, efi: Break up large initrd reads The efi boot stub tries to read the entire initrd in 1 go, however some efi implementations hang if asked to read too much data at the same time. After some experimentation I found out that my asrock p67 board will hang if asked to read chunks of 4MiB, so use a safe value. elilo reads in chunks of 16KiB, but since that requires many read calls I use a value of 1 MiB. hpa suggested adding individual blacklists for when systems are found where this value causes a crash. Signed-off-by: Maarten Lankhorst Link: http://lkml.kernel.org/r/4EEB3A02.3090201@gmail.com Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/eboot.c | 20 ++++++++++++++------ arch/x86/boot/compressed/eboot.h | 1 + 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 4055e63d0b04..fec216f4fbc3 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -643,14 +643,22 @@ grow: u64 size; size = initrds[j].size; - status = efi_call_phys3(fh->read, initrds[j].handle, - &size, addr); - if (status != EFI_SUCCESS) - goto free_initrd_total; + while (size) { + u64 chunksize; + if (size > EFI_READ_CHUNK_SIZE) + chunksize = EFI_READ_CHUNK_SIZE; + else + chunksize = size; + status = efi_call_phys3(fh->read, + initrds[j].handle, + &chunksize, addr); + if (status != EFI_SUCCESS) + goto free_initrd_total; + addr += chunksize; + size -= chunksize; + } efi_call_phys1(fh->close, initrds[j].handle); - - addr += size; } } diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index f66d023e91ef..39251663e65b 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -12,6 +12,7 @@ #define DESC_TYPE_CODE_DATA (1 << 0) #define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) +#define EFI_READ_CHUNK_SIZE (1024 * 1024) #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -- cgit v1.2.3 From a0c3832a578c84d4a93c61e22cb09c99fa9447ea Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sat, 17 Dec 2011 21:57:25 +0000 Subject: x86/apb: Fix configuration constraints The APB timer requires SFI, SCU and MID support Reported-by: Ingo Molnar Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20111217215719.3743.93550.stgit@bob.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2b54a2fb3ab0..ca4ee7644855 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ 
-665,6 +665,7 @@ config APB_TIMER def_bool y if MRST prompt "Langwell APB Timer Support" if X86_MRST select DW_APB_TIMER + depends on X86_INTEL_MID && SFI help APB timer is the replacement for 8254, HPET on X86 MID platforms. The APBT provides a stable time base on SMP -- cgit v1.2.3 From 933b9463a0ef75da681747b2dac06c1754465372 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sat, 17 Dec 2011 17:43:40 +0000 Subject: x86/intel config: Revamp configuration to allow for Moorestown and Medfield This sets all up the other bits that need to be INTEL_MID specific rather than Moorestown specific. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20111217174318.7207.91543.stgit@bob.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++-- arch/x86/include/asm/fixmap.h | 2 +- arch/x86/include/asm/setup.h | 2 +- arch/x86/pci/Makefile | 2 +- arch/x86/platform/mrst/Makefile | 4 ++-- drivers/rtc/Kconfig | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ca4ee7644855..c3c9343e4498 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -662,8 +662,8 @@ config HPET_EMULATE_RTC depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) config APB_TIMER - def_bool y if MRST - prompt "Langwell APB Timer Support" if X86_MRST + def_bool y if X86_INTEL_MID + prompt "Intel MID APB Timer Support" if X86_INTEL_MID select DW_APB_TIMER depends on X86_INTEL_MID && SFI help diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 460c74e4852c..4da3c0c4c974 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -117,7 +117,7 @@ enum fixed_addresses { #endif FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ -#ifdef CONFIG_X86_MRST +#ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif __end_of_permanent_fixed_addresses, diff --git 
a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9756551ec760..d0f19f9fb846 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern void setup_default_timer_irq(void); -#ifdef CONFIG_X86_MRST +#ifdef CONFIG_X86_INTEL_MID extern void x86_mrst_early_setup(void); #else static inline void x86_mrst_early_setup(void) { } diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 6b8759f7634e..75b06f34b1f2 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_MRST) += mrst.o +obj-$(CONFIG_X86_INTEL_MID) += mrst.o obj-y += common.o early.o obj-y += amd_bus.o bus_numa.o diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index ddeec7300464..7baed5135e0f 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_MRST) += mrst.o -obj-$(CONFIG_X86_MRST) += vrtc.o +obj-$(CONFIG_X86_INTEL_MID) += mrst.o +obj-$(CONFIG_X86_INTEL_MID) += vrtc.o obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o obj-$(CONFIG_X86_MRST) += pmu.o diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 53eb4e55b289..3a125b835546 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -498,9 +498,9 @@ config RTC_DRV_CMOS will be called rtc-cmos. 
config RTC_DRV_VRTC - tristate "Virtual RTC for Moorestown platforms" - depends on X86_MRST - default y if X86_MRST + tristate "Virtual RTC for Intel MID platforms" + depends on X86_INTEL_MID + default y if X86_INTEL_MID help Say "yes" here to get direct support for the real time clock -- cgit v1.2.3 From d79a8869d8a4b565b12a88faeff834b09a36368c Mon Sep 17 00:00:00 2001 From: Michael Demeter Date: Thu, 15 Dec 2011 22:31:23 +0000 Subject: x86/mrst: Add additional debug prints for pb_keys Added additional debug output that we always seem to add during power ons to validate firmware operation. Signed-off-by: Michael Demeter Signed-off-by: Kirill A. Shutemov Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20111215223116.10166.50803.stgit@bob.linux.org.uk [ fixed line breaks, formatting and commit title. ] Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 6a21f603bd78..b6a33d2bd4d6 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -983,6 +983,7 @@ static int __init pb_keys_init(void) num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); for (i = 0; i < num; i++) { gb[i].gpio = get_gpio_by_name(gb[i].desc); + pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio); if (gb[i].gpio == -1) continue; -- cgit v1.2.3 From 35edc2a5095efb189e60dc32bbb9d2663aec6d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 20:36:02 +0100 Subject: perf, arch: Rework perf_event_index() Put the logic to compute the event index into a per pmu method. This is required because the x86 rules are weird and wonderful and don't match the capabilities of the current scheme. AFAIK only powerpc actually has a usable userspace read of the PMCs but I'm not at all sure anybody actually used that. 
ARM is restored to the default since it currently does not support userspace access at all. And all software events are provided with a method that reports their index as 0 (disabled). Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Richard Kuo Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-dfydxodki16lylkt3gl2j7cw@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/perf_event.h | 4 ---- arch/frv/include/asm/perf_event.h | 2 -- arch/hexagon/include/asm/perf_event.h | 2 -- arch/powerpc/include/asm/perf_event_server.h | 2 -- arch/powerpc/kernel/perf_event.c | 6 ++++++ arch/s390/include/asm/perf_event.h | 1 - arch/x86/include/asm/perf_event.h | 2 -- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 27 ++++++++++++++++++++++----- kernel/events/hw_breakpoint.c | 7 +++++++ 10 files changed, 41 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h index 0f8e3827a89b..08f94d8fc04c 100644 --- a/arch/arm/include/asm/perf_event.h +++ b/arch/arm/include/asm/perf_event.h @@ -12,10 +12,6 @@ #ifndef __ARM_PERF_EVENT_H__ #define __ARM_PERF_EVENT_H__ -/* ARM performance counters start from 1 (in the cp15 accesses) so use the - * same indexes here for consistency. */ -#define PERF_EVENT_INDEX_OFFSET 1 - /* ARM perf PMU IDs for use by internal perf clients. 
*/ enum arm_perf_pmu_ids { ARM_PERF_PMU_ID_XSCALE1 = 0, diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h index a69e0155d146..c52ea5546b5b 100644 --- a/arch/frv/include/asm/perf_event.h +++ b/arch/frv/include/asm/perf_event.h @@ -12,6 +12,4 @@ #ifndef _ASM_PERF_EVENT_H #define _ASM_PERF_EVENT_H -#define PERF_EVENT_INDEX_OFFSET 0 - #endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/hexagon/include/asm/perf_event.h b/arch/hexagon/include/asm/perf_event.h index 6c2910f91180..8b8526b491c7 100644 --- a/arch/hexagon/include/asm/perf_event.h +++ b/arch/hexagon/include/asm/perf_event.h @@ -19,6 +19,4 @@ #ifndef _ASM_PERF_EVENT_H #define _ASM_PERF_EVENT_H -#define PERF_EVENT_INDEX_OFFSET 0 - #endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 8f1df1208d23..1a8093fa8f71 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -61,8 +61,6 @@ struct pt_regs; extern unsigned long perf_misc_flags(struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct pt_regs *regs); -#define PERF_EVENT_INDEX_OFFSET 1 - /* * Only override the default definitions in include/linux/perf_event.h * if we have hardware PMU support. 
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 10a140f82cb8..d614ab57ccca 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1187,6 +1187,11 @@ static int power_pmu_event_init(struct perf_event *event) return err; } +static int power_pmu_event_idx(struct perf_event *event) +{ + return event->hw.idx; +} + struct pmu power_pmu = { .pmu_enable = power_pmu_enable, .pmu_disable = power_pmu_disable, @@ -1199,6 +1204,7 @@ struct pmu power_pmu = { .start_txn = power_pmu_start_txn, .cancel_txn = power_pmu_cancel_txn, .commit_txn = power_pmu_commit_txn, + .event_idx = power_pmu_event_idx, }; /* diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index a75f168d2718..4eb444edbe49 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -6,4 +6,3 @@ /* Empty, just to avoid compiling error */ -#define PERF_EVENT_INDEX_OFFSET 0 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..9b922c136254 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -188,8 +188,6 @@ extern u32 get_ibs_caps(void); #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); -#define PERF_EVENT_INDEX_OFFSET 0 - /* * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups. * This flag is otherwise unused and ABI specified to be 0, so nobody should diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 08855613ceb3..02545e6df95b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -680,6 +680,12 @@ struct pmu { * for each successful ->add() during the transaction. */ void (*cancel_txn) (struct pmu *pmu); /* optional */ + + /* + * Will return the value for perf_event_mmap_page::index for this event, + * if no implementation is provided it will default to: event->hw.idx + 1. 
+ */ + int (*event_idx) (struct perf_event *event); /*optional */ }; /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 0ca1f648ac08..3894309c41a2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3208,10 +3208,6 @@ int perf_event_task_disable(void) return 0; } -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) @@ -3220,7 +3216,7 @@ static int perf_event_index(struct perf_event *event) if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; + return event->pmu->event_idx(event); } static void calc_timer_values(struct perf_event *event, @@ -4992,6 +4988,11 @@ static int perf_swevent_init(struct perf_event *event) return 0; } +static int perf_swevent_event_idx(struct perf_event *event) +{ + return 0; +} + static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -5001,6 +5002,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -5087,6 +5090,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -5306,6 +5311,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; /* @@ -5378,6 +5385,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -5405,6 +5414,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) perf_pmu_enable(pmu); } +static int 
perf_event_idx_default(struct perf_event *event) +{ + return event->hw.idx + 1; +} + /* * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. @@ -5594,6 +5608,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->event_idx) + pmu->event_idx = perf_event_idx_default; + list_add_rcu(&pmu->entry, &pmus); ret = 0; unlock: diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..b0309f76d777 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -613,6 +613,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } +static int hw_breakpoint_event_idx(struct perf_event *bp) +{ + return 0; +} + static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -622,6 +627,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) -- cgit v1.2.3 From fe4a330885aee20f233de36085fb15c38094e635 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 20:44:06 +0100 Subject: perf, x86: Implement user-space RDPMC support, to allow fast, user-space access to self-monitoring counters Implement a correct pmu::event_idx for the x86 counter index rules and set CR4.PCE on CPU_STARTING. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Link: http://lkml.kernel.org/n/tip-mwxab34dibqgzk5zywutfnha@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..53b569910175 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1210,6 +1210,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1542,6 +1543,18 @@ static int x86_pmu_event_init(struct perf_event *event) return err; } +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { + idx -= X86_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -1557,6 +1570,8 @@ static struct pmu pmu = { .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, + + .event_idx = x86_pmu_event_idx, }; /* -- cgit v1.2.3 From 0c9d42ed4cee2aa1dfc3a260b741baae8615744f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 23:30:47 +0100 Subject: perf, x86: Provide means for disabling userspace RDPMC Allow the disabling of RDPMC via a pmu specific attribute: echo 0 > /sys/bus/event_source/devices/cpu/rdpmc Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-pqeog465zo5hsimtkfz73f27@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 55 +++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.h | 8 ++++++ include/linux/perf_event.h | 1 + kernel/events/core.c | 1 + 4 files changed, 64 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git 
a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 53b569910175..116b040a73a8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1210,7 +1211,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - set_in_cr4(X86_CR4_PCE); + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1320,6 +1322,8 @@ static int __init init_hw_perf_events(void) } } + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1555,10 +1559,59 @@ static int x86_pmu_event_idx(struct perf_event *event) return idx + 1; } +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul(buf, NULL, 0); + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + smp_call_function(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + NULL, +}; + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, 
.pmu_disable = x86_pmu_disable, + .attr_groups = x86_pmu_attr_groups, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..513d617b93c4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -307,6 +307,14 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; + /* + * sysfs attrs + */ + int attr_rdpmc; + + /* + * CPU Hotplug hooks + */ int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 02545e6df95b..5311b79fe62c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -615,6 +615,7 @@ struct pmu { struct list_head entry; struct device *dev; + const struct attribute_group **attr_groups; char *name; int type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 05affc3878ff..dcd4049e92fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5505,6 +5505,7 @@ static int pmu_dev_alloc(struct pmu *pmu) if (!pmu->dev) goto out; + pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) -- cgit v1.2.3 From e3f3541c19c89a4daae39300defba68943301949 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 11:43:53 +0100 Subject: perf: Extend the mmap control page with time (TSC) fields Extend the mmap control page with fields so that userspace can compute time deltas relative to the provided time fields. Currently only implemented for x86 with constant and nonstop TSC. 
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-3u1jucza77j3wuvs0x2bic0f@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++++ include/linux/perf_event.h | 4 +++- kernel/events/core.c | 21 ++++++++++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 116b040a73a8..f8bddb5b0600 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "perf_event.h" @@ -1627,6 +1628,19 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, }; +void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return; + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + return; + + userpg->time_mult = this_cpu_read(cyc2ns); + userpg->time_shift = CYC2NS_SCALE_FACTOR; + userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +} + /* * callchain support */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5311b79fe62c..0b91db2522cc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -291,12 +291,14 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ + __u32 time_mult, time_shift; + __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[123]; /* align to 1k */ + __u64 __reserved[121]; /* align to 1k */ /* * Control data for the mmap() data buffer. 
diff --git a/kernel/events/core.c b/kernel/events/core.c index dcd4049e92fc..3a9c7d81afbf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3220,17 +3220,22 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, + u64 *now, u64 *enabled, u64 *running) { - u64 now, ctx_time; + u64 ctx_time; - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; + *now = perf_clock(); + ctx_time = event->shadow_ctx_time + *now; *enabled = ctx_time - event->tstamp_enabled; *running = ctx_time - event->tstamp_running; } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3240,7 +3245,7 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; - u64 enabled, running; + u64 enabled, running, now; rcu_read_lock(); /* @@ -3252,7 +3257,7 @@ void perf_event_update_userpage(struct perf_event *event) * because of locking issue as we can be called in * NMI context */ - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3277,6 +3282,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); + perf_update_user_clock(userpg, now); + barrier(); ++userpg->lock; preempt_enable(); @@ -3763,7 +3770,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0; + u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* @@ -3776,7 +3783,7 @@ static void perf_output_read(struct perf_output_handle *handle, * NMI context 
*/ if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); -- cgit v1.2.3 From 549c89b98c4530b278dde1a3f68ce5ebbb1e6304 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2011 12:44:55 -0800 Subject: x86: Do not schedule while still in NMI context The NMI handler uses the paranoid_exit routine that checks the NEED_RESCHED flag, and if it is set and the return is for userspace, then interrupts are enabled, the stack is swapped to the thread's stack, and schedule is called. The problem with this is that we are still in an NMI context until an iret is executed. This means that any new NMIs are now starved until an interrupt or exception occurs and does the iret. As NMIs can not be masked and can interrupt any location, they are treated as a special case. NEED_RESCHED should not be set in an NMI handler. The interruption by the NMI should not disturb the work flow for scheduling. Any IPI sent to a processor after sending the NEED_RESCHED would have to wait for the NMI anyway, and after the IPI finishes the schedule would be called as required. There is no reason to do anything special leaving an NMI. Remove the call to paranoid_exit and do a simple return. This not only fixes the bug of starved NMIs, but it also cleans up the code. Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com Acked-by: Andi Kleen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "H. 
Peter Anvin" Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Paul Turner Signed-off-by: Linus Torvalds Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e74b0b..3819ea907339 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1489,46 +1489,14 @@ ENTRY(nmi) movq %rsp,%rdi movq $-1,%rsi call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS - /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ - DISABLE_INTERRUPTS(CLBR_NONE) testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 jmp irq_return -nmi_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz nmi_schedule - movl %ebx,%edx /* arg3: thread flags */ - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - jmp nmi_userspace -nmi_schedule: - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - jmp nmi_userspace CFI_ENDPROC -#else - jmp paranoid_exit - CFI_ENDPROC -#endif END(nmi) ENTRY(ignore_sysret) -- cgit v1.2.3 From 1fd466efc88c48f50e5ee29f4dbb4e210a889172 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Dec 2011 12:32:27 -0500 Subject: x86: Document the NMI handler about not using paranoid_exit Linus cleaned up the NMI handler but it still needs some comments to explain why it uses save_paranoid but not paranoid_exit. Just to keep others from adding that in the future, document why it's not used. 
Cc: Linus Torvalds Cc: Andi Kleen Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3819ea907339..d1d5434e7f6a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1480,9 +1480,16 @@ END(error_exit) ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + /* + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ -- cgit v1.2.3 From 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Dec 2011 12:36:23 -0500 Subject: x86: Add workaround to NMI iret woes In x86, when an NMI goes off, the CPU goes into an NMI context that prevents other NMIs from triggering on that CPU. If an NMI is supposed to trigger, it has to wait till the previous NMI leaves NMI context. At that time, the next NMI can trigger (note, only one more NMI will trigger, as only one can be latched at a time). The way x86 gets out of NMI context is by calling iret. The problem with this is that this causes problems if the NMI handler either triggers an exception, or a breakpoint. Both the exception and the breakpoint handlers will finish with an iret. If this happens while in NMI context, the CPU will leave NMI context and a new NMI may come in. As NMI handlers are not made to be re-entrant, this can cause havoc with the system, not to mention, the nested NMI will write all over the previous NMI's stack.
Linus Torvalds proposed the following workaround to this problem: https://lkml.org/lkml/2010/7/14/264 "In fact, I wonder if we couldn't just do a software NMI disable instead? Have a per-cpu variable (in the _core_ percpu areas that get allocated statically) that points to the NMI stack frame, and just make the NMI code itself do something like NMI entry: - load percpu NMI stack frame pointer - if non-zero we know we're nested, and should ignore this NMI: - we're returning to kernel mode, so return immediately by using "popf/ret", which also keeps NMI's disabled in the hardware until the "real" NMI iret happens. - before the popf/iret, use the NMI stack pointer to make the NMI return stack be invalid and cause a fault - set the NMI stack pointer to the current stack pointer NMI exit (not the above "immediate exit because we nested"): clear the percpu NMI stack pointer Just do the iret. Now, the thing is, now the "iret" is atomic. If we had a nested NMI, we'll take a fault, and that re-does our "delayed" NMI - and NMI's will stay masked. And if we didn't have a nested NMI, that iret will now unmask NMI's, and everything is happy." I first tried to follow this advice but as I started implementing this code, a few gotchas showed up. One, is accessing per-cpu variables in the NMI handler. The problem is that per-cpu variables use the %gs register to get the variable for the given CPU. But as the NMI may happen in userspace, we must first perform a SWAPGS to get to it. The NMI handler already does this later in the code, but it's too late as we have saved off all the registers and we don't want to do that for a disabled NMI. Peter Zijlstra suggested to keep all variables on the stack. This simplifies things greatly and it has the added benefit of cache locality. Two, faulting on the iret. I really wanted to make this work, but it was becoming very hacky, and I never got it to be stable.
The iret already had a fault handler for userspace faulting with bad segment registers, and getting NMI to trigger a fault and detect it was very tricky. But for strange reasons, the system would usually take a double fault and crash. I never figured out why and decided to go with a simple "jmp" approach. The new approach I took also simplified things. Finally, the last problem with Linus's approach was to have the nested NMI handler do a ret instead of an iret to give the first NMI NMI-context again. The problem is that ret is much more limited than an iret. I couldn't figure out how to get the stack back where it belonged. I could have copied the current stack, pushed the return onto it, but my fear here is that there may be some place that writes data below the stack pointer. I know that is not something code should depend on, but I don't want to chance it. I may add this feature later, but for now, an NMI handler that loses NMI context will not get it back. Here's what is done: When an NMI comes in, the HW pushes the interrupt stack frame onto the per cpu NMI stack that is selected by the IST. A special location on the NMI stack holds a variable that is set when the first NMI handler runs. If this variable is set then we know that this is a nested NMI and we process the nested NMI code. There is still a race when this variable is cleared and an NMI comes in just before the first NMI does the return. For this case, if the variable is cleared, we also check if the interrupted stack is the NMI stack. If it is, then we process the nested NMI code. Why the two tests and not just test the interrupted stack? Consider the case where the first NMI hits a breakpoint and loses NMI context, then hits another breakpoint, and while processing that breakpoint we get a nested NMI. When processing a breakpoint, the stack changes to the breakpoint stack. If another NMI comes in here we can't rely on the interrupted stack to be the NMI stack.
If the variable is not set and the interrupted task's stack is not the NMI stack, then we know this is the first NMI and we can process things normally. But in order to do so, we need to do a few things first. 1) Set the stack variable that tells us that we are in an NMI handler 2) Make two copies of the interrupt stack frame. One copy is used to return on iret The other is used to restore the first one if we have a nested NMI. This is what the stack will look like: +-------------------------+ | original SS | | original Return RSP | | original RFLAGS | | original CS | | original RIP | +-------------------------+ | temp storage for rdx | +-------------------------+ | NMI executing variable | +-------------------------+ | Saved SS | | Saved Return RSP | | Saved RFLAGS | | Saved CS | | Saved RIP | +-------------------------+ | copied SS | | copied Return RSP | | copied RFLAGS | | copied CS | | copied RIP | +-------------------------+ | pt_regs | +-------------------------+ The original stack frame contains what the HW put in when we entered the NMI. We store %rdx as a temp variable to use. Both the original HW stack frame and this %rdx storage will be clobbered by nested NMIs so we can not rely on them later in the first NMI handler. The next item is the special stack variable that is set when we execute the rest of the NMI handler. Then we have two copies of the interrupt stack. The second copy is modified by any nested NMIs to let the first NMI know that we triggered a second NMI (latched) and that we should repeat the NMI handler. If the first NMI hits an exception or breakpoint that takes it out of NMI context, if a second NMI comes in before the first one finishes, it will update the copied interrupt stack to point to a fix up location to trigger another NMI. When the first NMI calls iret, it will instead jump to the fix up location. This fix up location will copy the saved interrupt stack back to the copy and execute the nmi handler again. 
Note, the nested NMI knows enough to check if it preempted a previous NMI handler while it is in the fixup location. If it has, it will not modify the copied interrupt stack and will just leave as if nothing happened. As the NMI handler is about to execute again, there's no reason to latch now. To test all this, I forced the NMI handler to call iret and take itself out of NMI context. I also added assembly code to write to the serial port to make sure that it hits the nested path as well as the fix up path. Everything seems to be working fine. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 177 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d1d5434e7f6a..b62aa298df7f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1475,11 +1475,166 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ + .macro test_in_nmi reg stack nmi_ret normal_ret + cmpq %\reg, \stack + ja \normal_ret + subq $EXCEPTION_STKSZ, %\reg + cmpq %\reg, \stack + jb \normal_ret + jmp \nmi_ret + .endm /* runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check a special location on the stack that contains + * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to repeat_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable. + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + */ + + /* Use %rdx as our temp variable throughout */ + pushq_cfi %rdx + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmp $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one.
+ */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -6*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 6*8 + pushq_cfi $__KERNEL_DS + pushq_cfi %rdx + pushfq_cfi + pushq_cfi $__KERNEL_CS + pushq_cfi $repeat_nmi + + /* Put stack back */ + addq $(11*8), %rsp + CFI_ADJUST_CFA_OFFSET -11*8 + +nested_nmi_out: + popq_cfi %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved RIP is used to fix up the copied RIP that a nested + * NMI may zero out. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Set the NMI executing variable on the stack. 
*/ + pushq_cfi $1 + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq_cfi 6*8(%rsp) + .endr + + /* Make another copy, this one may be modified by nested NMIs */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + /* Do not pop rdx, nested NMIs will corrupt it */ + movq 11*8(%rsp), %rdx + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception. Repeated NMIs + * caused by an exception and nested NMI will start here, and + * can still be preempted by another NMI. + */ +restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1502,10 +1657,32 @@ nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + /* Clear the NMI executing stack variable */ + movq $0, 10*8(%rsp) jmp irq_return CFI_ENDPROC END(nmi) + /* + * If an NMI hit an iret because of an exception or breakpoint, + * it can lose its NMI context, and a nested NMI may come in. + * In that case, the nested NMI will change the preempted NMI's + * stack to jump to here when it does the final iret. + */ +repeat_nmi: + INTR_FRAME + /* Update the stack variable to say we are still in NMI */ + movq $1, 5*8(%rsp) + + /* copy the saved stack back to copy stack */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + jmp restart_nmi + CFI_ENDPROC +end_repeat_nmi: + ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax -- cgit v1.2.3 From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Dec 2011 03:02:19 -0500 Subject: x86: Keep current stack in NMI breakpoints We want to allow NMI handlers to have breakpoints to be able to remove stop_machine from ftrace, kprobes and jump_labels. But if an NMI interrupts a current breakpoint, and then it triggers a breakpoint itself, it will switch to the breakpoint stack and corrupt the data on it for the breakpoint processing that it interrupted. 
Instead, have the NMI check if it interrupted breakpoint processing by checking if the stack that is currently used is a breakpoint stack. If it is, then load a special IDT that changes the IST for the debug exception to keep the same stack in kernel context. When the NMI is done, it puts it back. This way, if the NMI does trigger a breakpoint, it will keep using the same stack and not stomp on the breakpoint data for the breakpoint it interrupted. Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- arch/x86/include/asm/desc.h | 12 ++++++++++++ arch/x86/include/asm/processor.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ arch/x86/kernel/head_64.S | 4 ++++ arch/x86/kernel/nmi.c | 15 +++++++++++++++ arch/x86/kernel/traps.c | 6 ++++++ 6 files changed, 65 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 41935fadfdfc..e95822d683f4 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; +extern struct desc_ptr nmi_idt_descr; +extern gate_desc nmi_idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit = (limit >> 16) & 0xf; } +#ifdef CONFIG_X86_64 +static inline void set_nmi_gate(int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(nmi_idt_table, gate, &s); +} +#endif + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b650435ffb53..4b39d6d7e3a1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -402,6 +402,9 @@ 
DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR /* @@ -416,6 +419,9 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } #endif /* X86_64 */ extern unsigned int xstate_size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a831..caa404556b9c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1090,6 +1092,24 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); + +int is_debug_stack(unsigned long addr) +{ + return addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } 
} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e39478a49..40f4eb3766d1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 + .align L1_CACHE_BYTES +ENTRY(nmi_idt_table) + .skip IDT_ENTRIES * 16 + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b58ddd..de8d4b333f40 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { + int update_debug_stack = 0; + + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. + */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + update_debug_stack = 1; + } nmi_enter(); inc_irq_stat(__nmi_count); @@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code) default_do_nmi(regs); nmi_exit(); + + if (unlikely(update_debug_stack)) + debug_stack_reset(); } void stop_nmi(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb83466c..a93c5cabc36a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -723,4 +723,10 @@ void __init trap_init(void) cpu_init(); x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(1, &debug); + set_nmi_gate(3, &int3); +#endif } -- cgit v1.2.3 From ccd49c2391773ffbf52bb80d75c4a92b16972517 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 Dec 2011 16:44:16 -0500 Subject: x86: Allow NMIs to hit breakpoints in i386 With i386, NMIs and breakpoints use the current stack and they do not reset the stack pointer to a fix point that might 
corrupt a previous NMI or breakpoint (as it does in x86_64). But NMIs are still not made to be re-entrant, and need to prevent the case that an NMI hitting a breakpoint (which does an iret), doesn't allow another NMI to run. The fix is to let the NMI be in 3 different states: 1) not running 2) executing 3) latched When no NMI is executing on a given CPU, the state is "not running". When the first NMI comes in, the state is switched to "executing". On exit of that NMI, a cmpxchg is performed to switch the state back to "not running" and if that fails, the NMI is restarted. If a breakpoint is hit and does an iret, which re-enables NMIs, and another NMI comes in before the first NMI finished, it will detect that the state is not in the "not running" state and the current NMI is nested. In this case, the state is switched to "latched" to let the interrupted NMI know to restart the NMI handler, and the nested NMI exits without doing anything. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index de8d4b333f40..47acaf319165 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -405,11 +405,84 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); } -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - int update_debug_stack = 0; +/* + * NMIs can hit breakpoints which will cause it to lose its + * NMI context with the CPU when the breakpoint does an iret. + */ +#ifdef CONFIG_X86_32 +/* + * For i386, NMIs use the same stack as the kernel, and we can + * add a workaround to the iret problem in C. Simply have 3 states + * the NMI can be in. 
+ * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI hits a breakpoint that executes an iret, another + * NMI can preempt it. We do not want to allow this new NMI + * to run, but we want to execute it when the first one finishes. + * We set the state to "latched", and the first NMI will perform + * an cmpxchg on the state, and if it doesn't successfully + * reset the state to "not running" it will restart the next + * NMI. + */ +enum nmi_states { + NMI_NOT_RUNNING, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); + +#define nmi_nesting_preprocess(regs) \ + do { \ + if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ + __get_cpu_var(nmi_state) = NMI_LATCHED; \ + return; \ + } \ + nmi_restart: \ + __get_cpu_var(nmi_state) = NMI_EXECUTING; \ + } while (0) + +#define nmi_nesting_postprocess() \ + do { \ + if (cmpxchg(&__get_cpu_var(nmi_state), \ + NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + goto nmi_restart; \ + } while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. 
This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. + */ +static DEFINE_PER_CPU(int, update_debug_stack); +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ /* * If we interrupted a breakpoint, it is possible that * the nmi handler will have breakpoints too. We need to @@ -418,8 +491,22 @@ do_nmi(struct pt_regs *regs, long error_code) */ if (unlikely(is_debug_stack(regs->sp))) { debug_stack_set_zero(); - update_debug_stack = 1; + __get_cpu_var(update_debug_stack) = 1; } +} + +static inline void nmi_nesting_postprocess(void) +{ + if (unlikely(__get_cpu_var(update_debug_stack))) + debug_stack_reset(); +} +#endif + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_nesting_preprocess(regs); + nmi_enter(); inc_irq_stat(__nmi_count); @@ -429,8 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code) nmi_exit(); - if (unlikely(update_debug_stack)) - debug_stack_reset(); + /* On i386, may loop back to preprocess */ + nmi_nesting_postprocess(); } void stop_nmi(void) -- cgit v1.2.3 From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 11:43:02 -0500 Subject: x86: Add counter when debug stack is used with interrupts enabled Mathieu Desnoyers pointed out a case that can cause issues with NMIs running on the debug stack: int3 -> interrupt -> NMI -> int3 Because the interrupt changes the stack, 
the NMI will not see that it preempted the debug stack. Looking deeper at this case, interrupts only happen when the int3 is from userspace or in a location in the exception table (fixup). userspace -> int3 -> interrupt -> NMI -> int3 All other int3s that happen in the kernel should be processed without ever enabling interrupts, as the do_trap() call will panic the kernel if it is called to process any other location within the kernel. Adding a counter around the sections that enable interrupts while using the debug stack allows the NMI to also check that case. If the NMI sees that it either interrupted a task using the debug stack or the debug counter is non-zero, then it will have to change the IDT table to make the int3 not change stacks (which will corrupt the stack if it does). Note, I had to move the debug_usage functions out of processor.h and into debugreg.h because of the static inlined functions to inc and dec the debug_usage counter. __get_cpu_var() requires smp.h which includes processor.h, and would fail to build. Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com Reported-by: Mathieu Desnoyers Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. 
Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- arch/x86/include/asm/debugreg.h | 22 ++++++++++++++++++++++ arch/x86/include/asm/processor.h | 6 ------ arch/x86/kernel/cpu/common.c | 6 ++++-- arch/x86/kernel/traps.c | 14 ++++++++++++++ 4 files changed, 40 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 078ad0caefc6..b903d5ea3941 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(int, debug_stack_usage); +static inline void debug_stack_usage_inc(void) +{ + __get_cpu_var(debug_stack_usage)++; +} +static inline void debug_stack_usage_dec(void) +{ + __get_cpu_var(debug_stack_usage)--; +} +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); +#else /* !X86_64 */ +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } +static inline void debug_stack_usage_inc(void) { } +static inline void debug_stack_usage_dec(void) { } +#endif /* X86_64 */ + + #endif /* __KERNEL__ */ #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4b39d6d7e3a1..b650435ffb53 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -402,9 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); -int is_debug_stack(unsigned long addr); -void debug_stack_set_zero(void); -void debug_stack_reset(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR /* @@ -419,9 +416,6 @@ struct stack_canary { }; 
DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -static inline int is_debug_stack(unsigned long addr) { return 0; } -static inline void debug_stack_set_zero(void) { } -static inline void debug_stack_reset(void) { } #endif /* X86_64 */ extern unsigned int xstate_size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index caa404556b9c..266e4649b1da 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1093,11 +1093,13 @@ unsigned long kernel_eflags; DEFINE_PER_CPU(struct orig_ist, orig_ist); static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); int is_debug_stack(unsigned long addr) { - return addr <= __get_cpu_var(debug_stack_addr) && - addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } void debug_stack_set_zero(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a93c5cabc36a..0072b38e3ea1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); + debug_stack_usage_dec(); } #ifdef CONFIG_X86_64 @@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. 
+ */ + debug_stack_usage_inc(); + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); @@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } -- cgit v1.2.3 From 7c9c3a1e5fc8728e948b8fa3cbcfcfb86db3afda Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 29 Dec 2011 14:43:16 +0000 Subject: x86/intel config: Fix the APB_TIMER selection Seems Kconfig SELECT isn't selecting things hierarchically when selected. config APB_TIMER def_bool y if X86_INTEL_MID prompt "Intel MID APB Timer Support" if X86_INTEL_MID select DW_APB_TIMER depends on X86_INTEL_MID && SFI when we select APB_TIMER doesn't select DW_APB_TIMER so do it by hand. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/n/tip-kpnaimplltk6d1lolusqj3ae@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07620bc913db..78fbb346959b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -409,12 +409,14 @@ config X86_MRST depends on PCI depends on PCI_GOANY depends on X86_IO_APIC + select X86_INTEL_MID + select SFI + select DW_APB_TIMER select APB_TIMER select I2C select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES - select X86_INTEL_MID ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. 
Moorestown consists of two chips: @@ -428,12 +430,14 @@ config X86_MDFLD depends on PCI depends on PCI_GOANY depends on X86_IO_APIC + select X86_INTEL_MID + select SFI + select DW_APB_TIMER select APB_TIMER select I2C select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES - select X86_INTEL_MID ---help--- Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. -- cgit v1.2.3 From cd42f4a3b2b1c4cbd997363dc57821953d73fd87 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 15 Dec 2011 10:48:12 -0800 Subject: HWPOISON: Clean up memory_failure() vs. __memory_failure() There is only one caller of memory_failure(), all other users call __memory_failure() and pass in the flags argument explicitly. The lone user of memory_failure() will soon need to pass flags too. Add flags argument to the callsite in mce.c. Delete the old memory_failure() function, and then rename __memory_failure() without the leading "__". Provide clearer message when action optional memory errors are ignored. Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 12 +++++++---- drivers/base/memory.c | 2 +- include/linux/mm.h | 3 +-- mm/hwpoison-inject.c | 4 ++-- mm/madvise.c | 2 +- mm/memory-failure.c | 46 ++++++++++++++++++---------------------- 6 files changed, 34 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d4c3d1..1a08ce5f345f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1046,11 +1046,15 @@ out: } EXPORT_SYMBOL_GPL(do_machine_check); -/* dummy to break dependency. 
actual code is in mm/memory-failure.c */ -void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags) { - printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); + printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + + return 0; } +#endif /* * Called after mce notification in process context. This code @@ -1068,7 +1072,7 @@ void mce_notify_process(void) unsigned long pfn; mce_notify_irq(); while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR); + memory_failure(pfn, MCE_VECTOR, 0); } static void mce_process_work(struct work_struct *dummy) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8272d92d22c0..9a924440053f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -474,7 +474,7 @@ store_hard_offline_page(struct class *class, if (strict_strtoull(buf, 0, &pfn) < 0) return -EINVAL; pfn >>= PAGE_SHIFT; - ret = __memory_failure(pfn, 0, 0); + ret = memory_failure(pfn, 0, 0); return ret ? 
ret : count; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4baadd18f4ad..bcc523474724 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1607,8 +1607,7 @@ void vmemmap_populate_print_last(void); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, }; -extern void memory_failure(unsigned long pfn, int trapno); -extern int __memory_failure(unsigned long pfn, int trapno, int flags); +extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c7fc7fd00e32..cc448bb983ba 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val) * do a racy check with elevated page count, to make sure PG_hwpoison * will only be set for the targeted owner (or on a free page). * We temporarily take page lock for try_get_mem_cgroup_from_page(). - * __memory_failure() will redo the check reliably inside page lock. + * memory_failure() will redo the check reliably inside page lock. 
*/ lock_page(hpage); err = hwpoison_filter(hpage); @@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) inject: printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); - return __memory_failure(pfn, 18, MF_COUNT_INCREASED); + return memory_failure(pfn, 18, MF_COUNT_INCREASED); } static int hwpoison_unpoison(void *data, u64 val) diff --git a/mm/madvise.c b/mm/madvise.c index 74bf193eff04..f5ab745672b7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -251,7 +251,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", page_to_pfn(p), start); /* Ignore return value for now */ - __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } return ret; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06d3479513aa..ab259bb0adc5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -984,7 +984,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage) ClearPageHWPoison(hpage + i); } -int __memory_failure(unsigned long pfn, int trapno, int flags) +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: fine tune action taken + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. 
+ */ +int memory_failure(unsigned long pfn, int trapno, int flags) { struct page_state *ps; struct page *p; @@ -1156,29 +1174,7 @@ out: unlock_page(hpage); return res; } -EXPORT_SYMBOL_GPL(__memory_failure); - -/** - * memory_failure - Handle memory failure of a page. - * @pfn: Page Number of the corrupted page - * @trapno: Trap number reported in the signal to user space. - * - * This function is called by the low level machine check code - * of an architecture when it detects hardware memory corruption - * of a page. It tries its best to recover, which includes - * dropping pages, killing processes etc. - * - * The function is primarily of use for corruptions that - * happen outside the current execution context (e.g. when - * detected by a background scrubber) - * - * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. - */ -void memory_failure(unsigned long pfn, int trapno) -{ - __memory_failure(pfn, trapno, 0); -} +EXPORT_SYMBOL_GPL(memory_failure); #define MEMORY_FAILURE_FIFO_ORDER 4 #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) @@ -1251,7 +1247,7 @@ static void memory_failure_work_func(struct work_struct *work) spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); if (!gotten) break; - __memory_failure(entry.pfn, entry.trapno, entry.flags); + memory_failure(entry.pfn, entry.trapno, entry.flags); } } -- cgit v1.2.3 From 85f92694affa7dba7f1978666a69552b5dfc628e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 13 Dec 2011 09:48:13 -0800 Subject: x86/mce: Create helper function to save addr/misc when needed The MCI_STATUS_MISCV and MCI_STATUS_ADDRV bits in the bank status registers define whether the MISC and ADDR registers respectively contain valid data - provide a helper function to check these bits and read the registers when needed. 
In addition, processors that support software error recovery (as indicated by the MCG_SER_P bit in the MCG_CAP register) may include some undefined bits in the ADDR register - mask these out. Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1a08ce5f345f..2f1c200f05e6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -492,6 +492,27 @@ static void mce_report_event(struct pt_regs *regs) irq_work_queue(&__get_cpu_var(mce_irq_work)); } +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + + /* + * Mask the reported address by the reported granularity. + */ + if (mce_ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + } +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -542,10 +563,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -981,10 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (severity == MCE_AR_SEVERITY) kill_it = 1; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); /* * Action optional error. Queue address for later processing. 
-- cgit v1.2.3 From af104e394e17e328df85c25a9e21448539725b67 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 14 Dec 2011 15:55:20 -0800 Subject: x86/mce: Add mechanism to safely save information in MCE handler Machine checks on Intel cpus interrupt execution on all cpus, regardless of interrupt masking. We have a need to save some data about the cause of the machine check (physical address) in the machine check handler that can be retrieved later to attempt recovery in a more flexible execution state. Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2f1c200f05e6..e1579c5a71da 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -886,6 +886,49 @@ static void mce_clear_state(unsigned long *toclear) } } +/* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define MCE_INFO_MAX 16 + +struct mce_info { + atomic_t inuse; + struct task_struct *t; + __u64 paddr; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { + if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { + mi->t = current; + mi->paddr = addr; + return; + } + } + + mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) + if (atomic_read(&mi->inuse) && mi->t == current) + return mi; + return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ + atomic_set(&mi->inuse, 0); +} + /* * The actual machine check handler. 
This only handles real * exceptions when something got corrupted coming in through int 18. -- cgit v1.2.3 From a8c321fbf9aeced45519248e5901af8cbc240510 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 3 Jan 2012 11:45:45 -0800 Subject: x86/mce: Handle "action required" errors All non-urgent actions (reporting low severity errors and handling "action-optional" errors) are now handled by a work queue. This means that TIF_MCE_NOTIFY can be used to block execution for a thread experiencing an "action-required" fault until we get all cpus out of the machine check handler (and the thread that hit the fault into mce_notify_process()). We use the new mce_{save,find,clear}_info() API to get information from do_machine_check() to mce_notify_process(), and then use the newly improved memory_failure(..., MF_ACTION_REQUIRED) to handle the error (possibly signalling the process). Update some comments to make the new code flows clearer. Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 95 ++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e1579c5a71da..56e4e79387c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -982,7 +982,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) barrier(); /* - * When no restart IP must always kill or panic. + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -1036,12 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } - /* - * Kill on action required.
- */ - if (severity == MCE_AR_SEVERITY) - kill_it = 1; - mce_read_aux(&m, i); /* @@ -1062,6 +1058,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } } + /* mce_clear_state will clear *final, save locally for use later */ + m = *final; + if (!no_way_out) mce_clear_state(toclear); @@ -1073,27 +1072,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) no_way_out = worst >= MCE_PANIC_SEVERITY; /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - * - * This is mainly used in the case when the system doesn't - * support MCE broadcasting or it has been disabled. + * At insane "tolerant" levels we take no action. Otherwise + * we only die if we have no other choice. For less serious + * issues we try to recover, or limit damage to the current + * process. */ - if (no_way_out && tolerant < 3) - mce_panic("Fatal machine check on current CPU", final, msg); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. 
- */ - - if (kill_it && tolerant < 3) - force_sig(SIGBUS, current); - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); + if (tolerant < 3) { + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + if (worst == MCE_AR_SEVERITY) { + /* schedule action before return to userland */ + mce_save_info(m.addr); + set_thread_flag(TIF_MCE_NOTIFY); + } else if (kill_it) { + force_sig(SIGBUS, current); + } + } if (worst > 0) mce_report_event(regs); @@ -1107,6 +1101,8 @@ EXPORT_SYMBOL_GPL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int vector, int flags) { + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); @@ -1115,27 +1111,44 @@ int memory_failure(unsigned long pfn, int vector, int flags) #endif /* - * Called after mce notification in process context. This code - * is allowed to sleep. Call the high level VM handler to process - * any corrupted pages. - * Assume that the work queue code only calls this one at a time - * per CPU. - * Note we don't disable preemption, so this code might run on the wrong - * CPU. In this case the event is picked up by the scheduled work queue. - * This is merely a fast path to expedite processing in some common - * cases. + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here. 
*/ void mce_notify_process(void) { unsigned long pfn; - mce_notify_irq(); - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR, 0); + struct mce_info *mi = mce_find_info(); + + if (!mi) + mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); + pfn = mi->paddr >> PAGE_SHIFT; + + clear_thread_flag(TIF_MCE_NOTIFY); + + pr_err("Uncorrected hardware memory error in user-access at %llx", + mi->paddr); + if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + mce_clear_info(mi); } +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). + */ static void mce_process_work(struct work_struct *dummy) { - mce_notify_process(); + unsigned long pfn; + + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR, 0); } #ifdef CONFIG_X86_MCE_INTEL @@ -1225,8 +1238,6 @@ int mce_notify_irq(void) /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, &mce_need_notify)) { /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); -- cgit v1.2.3 From 5f7b88d51e89771f64c15903b96b5933dd0bc6d8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 3 Jan 2012 11:48:04 -0800 Subject: x86/mce: Recognise machine check bank signature for data path error Action required data path signature is defined in table 15-19 of SDM: +-----------------------------------------------------------------------------+ | SRAR Error | Valid | OVER | UC | EN | MISCV | ADDRV | PCC | S | AR | MCACOD | | Data Load | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0x134 | +-----------------------------------------------------------------------------+ Recognise this, and pass MCE_AR_SEVERITY code back to do_machine_check() if we have the action handler configured (CONFIG_MEMORY_FAILURE=y) 
Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 7395d5f4272d..f6c92f99efa0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -54,6 +54,7 @@ static struct severity { #define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) #define MCACOD 0xffff MCESEV( @@ -102,11 +103,24 @@ static struct severity { SER, BITCLR(MCI_STATUS_S) ), - /* AR add known MCACODs here */ MCESEV( PANIC, "Action required with lost events", SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "HT thread notices Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + USER + ), +#endif MCESEV( PANIC, "Action required: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) -- cgit v1.2.3 From 426932909093e4e7729777a0e2beed4b54911361 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 5 Jan 2012 16:12:25 +0000 Subject: x86-64: Slightly shorten copy_page() %r13 got saved and restored without ever getting touched, so there's no need to do so. 
Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/4F05D9F9020000780006AA0D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/lib/copy_page_64.S | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 01c805ba5359..6b34d04d096a 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -20,14 +20,12 @@ ENDPROC(copy_page_c) ENTRY(copy_page) CFI_STARTPROC - subq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET 3*8 + subq $2*8,%rsp + CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx,(%rsp) CFI_REL_OFFSET rbx, 0 movq %r12,1*8(%rsp) CFI_REL_OFFSET r12, 1*8 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13, 2*8 movl $(4096/64)-5,%ecx .p2align 4 @@ -91,10 +89,8 @@ ENTRY(copy_page) CFI_RESTORE rbx movq 1*8(%rsp),%r12 CFI_RESTORE r12 - movq 2*8(%rsp),%r13 - CFI_RESTORE r13 - addq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET -3*8 + addq $2*8,%rsp + CFI_ADJUST_CFA_OFFSET -2*8 ret .Lcopy_page_end: CFI_ENDPROC -- cgit v1.2.3 From e58d429209105e698e9d0357481d62b37fe9a7dd Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 6 Jan 2012 11:17:51 -0500 Subject: x86, reboot: Fix typo in nmi reboot path It was brought to my attention that my x86 change to use NMI in the reboot path broke Intel Nehalem and Westmere boxes when using kexec. I realized I had mistyped the if statement in commit 3603a2512f9e69dc87914ba922eb4a0812b21cd6 and stuck the ')' in the wrong spot. Putting it in the right spot fixes kexec again. Doh. 
Reported-by: Yinghai Lu Cc: Linus Torvalds Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1325866671-9797-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 113acda5879e..66c74f481cab 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -176,7 +176,7 @@ static void native_nmi_stop_other_cpus(int wait) */ if (num_online_cpus() > 1) { /* did someone beat us here? */ - if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1)) + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) return; if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, -- cgit v1.2.3 From 72142fd4109105c6bd21658966ca5e93c1684081 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 7 Jan 2012 14:10:18 -0800 Subject: x86: Move <asm/asm-offsets.h> from trace_syscalls.c to asm/syscall.h This reverts commit d5e553d6e0a4bdea43adae7373e3fa144b9a1aaa, which caused large numbers of build warnings on PowerPC. This moves the #include <asm/asm-offsets.h> to <asm/syscall.h>, which makes some kind of sense since NR_syscalls is syscalls related. Reported-by: Stephen Rothwell Signed-off-by: H.
Peter Anvin Link: http://lkml.kernel.org/r/20111214181545.6e13bc954cb7ddce9086e861@canb.auug.org.au --- arch/x86/include/asm/syscall.h | 1 + kernel/trace/trace_syscalls.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index c4a348f7bd43..d962e5652a73 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -15,6 +15,7 @@ #include #include +#include /* For NR_syscalls */ extern const unsigned long sys_call_table[]; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5f35f6f15d99..cb654542c1a1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -6,7 +6,6 @@ #include #include #include -#include #include "trace_output.h" #include "trace.h" -- cgit v1.2.3 From da517a08ac5913cd80ce3507cddd00f2a091b13c Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 6 Jan 2012 13:19:00 -0600 Subject: x86, UV: Update Boot messages for SGI UV2 platform SGI UV systems print a message during boot: UV: Found blades Due to packaging changes, the blade count is not accurate on the next generation of the platform. This patch corrects the count.
Signed-off-by: Jack Steiner Cc: Link: http://lkml.kernel.org/r/20120106191900.GA19772@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 9d59bbacd4e3..79b05b88aa19 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -769,7 +769,12 @@ void __init uv_system_init(void) for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) uv_possible_blades += hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); - printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); + + /* uv_num_possible_blades() is really the hub count */ + printk(KERN_INFO "UV: Found %d blades, %d hubs\n", + is_uv1_hub() ? uv_num_possible_blades() : + (uv_num_possible_blades() + 1) / 2, + uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kzalloc(bytes, GFP_KERNEL); -- cgit v1.2.3 From 8030c36d13f030103356709e63638678fdc66fdc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 9 Jan 2012 19:33:24 -0800 Subject: x86, atomic: atomic64_read() take a const pointer atomic64_read() doesn't actually write anything (as far as the C environment is concerned... the CPU does actually write but that's an implementation quirk), so it should take a const pointer. This does NOT mean that it is safe to use atomic64_read() on an object in readonly storage (it will trap!) Reported-by: Andrew Morton Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/20120109165859.1879abda.akpm@linux-foundation.org --- arch/x86/include/asm/atomic64_32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 24098aafce0d..fa13f0ec2874 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -82,7 +82,7 @@ static inline void atomic64_set(atomic64_t *v, long long i) * * Atomically reads the value of @v and returns it. */ -static inline long long atomic64_read(atomic64_t *v) +static inline long long atomic64_read(const atomic64_t *v) { long long r; asm volatile(ATOMIC64_ALTERNATIVE(read) -- cgit v1.2.3 From b6c96c0214138186f495e3ee73737c6fc5e4efa2 Mon Sep 17 00:00:00 2001 From: Stratos Psomadakis Date: Thu, 12 Jan 2012 15:44:47 +1030 Subject: lguest: Make sure interrupt is allocated ok by lguest_setup_irq Make sure the interrupt is allocated correctly by lguest_setup_irq (check the return value of irq_alloc_desc_at for -ENOMEM) Signed-off-by: Stratos Psomadakis Signed-off-by: Rusty Russell (cleanups and commentry) --- arch/x86/lguest/boot.c | 21 +++++++++++++-------- drivers/lguest/lguest_device.c | 10 +++++++--- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index cf4603ba866f..642d8805bc1b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void) } /* - * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so - * rather than set them in lguest_init_IRQ we are called here every time an - * lguest device needs an interrupt. - * - * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should - * pass that up! + * Interrupt descriptors are allocated as-needed, but low-numbered ones are + * reserved by the generic x86 code. 
So we ignore irq_alloc_desc_at if it + * tells us the irq is already used: other errors (ie. ENOMEM) we take + * seriously. */ -void lguest_setup_irq(unsigned int irq) +int lguest_setup_irq(unsigned int irq) { - irq_alloc_desc_at(irq, 0); + int err; + + /* Returns -ve error or vector number. */ + err = irq_alloc_desc_at(irq, 0); + if (err < 0 && err != -EEXIST) + return err; + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); + return 0; } /* diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 6a1d6447b864..9e8388efd88e 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -241,7 +241,7 @@ static void lg_notify(struct virtqueue *vq) } /* An extern declaration inside a C file is bad form. Don't do it. */ -extern void lguest_setup_irq(unsigned int irq); +extern int lguest_setup_irq(unsigned int irq); /* * This routine finds the Nth virtqueue described in the configuration of @@ -304,7 +304,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, } /* Make sure the interrupt is allocated. 
*/ - lguest_setup_irq(lvq->config.irq); + err = lguest_setup_irq(lvq->config.irq); + if (err) + goto destroy_vring; /* * Tell the interrupt for this virtqueue to go to the virtio_ring @@ -317,7 +319,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) - goto destroy_vring; + goto free_desc; /* * Last of all we hook up our 'struct lguest_vq_info" to the @@ -326,6 +328,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, vq->priv = lvq; return vq; +free_desc: + irq_free_desc(lvq->config.irq); destroy_vring: vring_del_virtqueue(vq); unmap: -- cgit v1.2.3 From 5cf9a4e69c1ff0ccdd1d2b7404f95c0531355274 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 12 Jan 2012 08:01:40 -0700 Subject: x86/PCI: build amd_bus.o only when CONFIG_AMD_NB=y We only need amd_bus.o for AMD systems with PCI. arch/x86/pci/Makefile already depends on CONFIG_PCI=y, so this patch just adds the dependency on CONFIG_AMD_NB. 
Cc: Yinghai Lu Cc: stable@kernel.org # 2.6.34+ (needs adjustment for k8 -> amd rename) Signed-off-by: Bjorn Helgaas Signed-off-by: Linus Torvalds --- arch/x86/pci/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 75b06f34b1f2..e76e18c94a3c 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -18,8 +18,9 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-$(CONFIG_X86_INTEL_MID) += mrst.o obj-y += common.o early.o -obj-y += amd_bus.o bus_numa.o +obj-y += bus_numa.o +obj-$(CONFIG_AMD_NB) += amd_bus.o obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o ifeq ($(CONFIG_PCI_DEBUG),y) -- cgit v1.2.3 From bccd17294a26b67a8a19aaa120e3eeaa7da49281 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 11 Jan 2012 05:11:46 +0400 Subject: x86: Get rid of 'dubious one-bit signed bitfield' sparse warning This very noisy sparse warning appears on almost every file in the kernel: CHECK init/main.c arch/x86/include/asm/thread_info.h:43:55: error: dubious one-bit signed bitfield arch/x86/include/asm/thread_info.h:44:46: error: dubious one-bit signed bitfield This patch changes sig_on_uaccess_error and uaccess_err flags to unsigned type and thus fixes the warning.
Signed-off-by: Anton Vorontsov Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- arch/x86/include/asm/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 74047159d0ab..bc817cd8b443 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,8 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - int sig_on_uaccess_error:1; - int uaccess_err:1; /* uaccess failed */ + unsigned int sig_on_uaccess_error:1; + unsigned int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ -- cgit v1.2.3 From 476bc0015bf09dad39d36a8b19f76f0c181d1ec9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: module_param: make bool parameters really bool (arch) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. 
Signed-off-by: Rusty Russell --- arch/ia64/hp/common/aml_nfw.c | 2 +- arch/x86/kernel/apm_32.c | 16 ++++++++-------- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/vmx.c | 18 +++++++++--------- arch/x86/kvm/x86.c | 4 ++-- arch/x86/mm/mmio-mod.c | 4 ++-- arch/x86/platform/geode/alix.c | 2 +- arch/x86/platform/iris/iris.c | 2 +- 8 files changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/hp/common/aml_nfw.c b/arch/ia64/hp/common/aml_nfw.c index 22078486d35d..6192f7188654 100644 --- a/arch/ia64/hp/common/aml_nfw.c +++ b/arch/ia64/hp/common/aml_nfw.c @@ -31,7 +31,7 @@ MODULE_AUTHOR("Bjorn Helgaas "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ACPI opregion handler for native firmware calls"); -static int force_register; +static bool force_register; module_param_named(force, force_register, bool, 0); MODULE_PARM_DESC(force, "Install opregion handler even without HPQ5001 device"); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a46bd383953c..f76623cbe263 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -383,21 +383,21 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -static int debug __read_mostly; -static int smp __read_mostly; +static bool debug __read_mostly; +static bool smp __read_mostly; static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static bool power_off; #else -static int power_off = 1; +static bool power_off = 1; #endif -static int realmode_power_off; +static bool realmode_power_off; #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static bool allow_ints = 1; #else -static int allow_ints; +static bool allow_ints; #endif -static int broken_psr; +static bool broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2a2a9b40db19..224b02c3cda9 100644 --- 
a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -74,7 +74,7 @@ enum { #endif #ifdef MMU_DEBUG -static int dbg = 0; +static bool dbg = 0; module_param(dbg, bool, 0644); #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 906a7e84200f..d29216c462b3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -51,29 +51,29 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int __read_mostly enable_vpid = 1; +static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); -static int __read_mostly flexpriority_enabled = 1; +static bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); -static int __read_mostly enable_ept = 1; +static bool __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); -static int __read_mostly enable_unrestricted_guest = 1; +static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); -static int __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static int __read_mostly vmm_exclusive = 1; +static bool __read_mostly vmm_exclusive = 1; module_param(vmm_exclusive, bool, S_IRUGO); -static int __read_mostly yield_on_hlt = 1; +static bool __read_mostly yield_on_hlt = 1; module_param(yield_on_hlt, bool, S_IRUGO); -static int __read_mostly fasteoi = 1; +static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); /* @@ -81,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO); * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. 
*/ -static int __read_mostly nested = 0; +static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1171def5f96b..14d6cadc4ba6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -88,8 +88,8 @@ static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); -int ignore_msrs = 0; -module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +static bool ignore_msrs = 0; +module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index de54b9b278a7..dc0b727742f4 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -75,8 +75,8 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ /* module parameters */ static unsigned long filter_offset; -static int nommiotrace; -static int trace_pc; +static bool nommiotrace; +static bool trace_pc; module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index ca1973699d3d..dc5f1d32aced 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -27,7 +27,7 @@ #include -static int force = 0; +static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 1ba7f5ed8c9b..5917eb56b313 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer "); MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); -static int force; +static bool force; 
module_param(force, bool, 0); MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); -- cgit v1.2.3 From 43570fd2f47ba518145e9289f54cde3dba4c8b25 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:27 -0800 Subject: mm,slub,x86: decouple size of struct page from CONFIG_CMPXCHG_LOCAL While implementing cmpxchg_double() on s390 I realized that we don't set CONFIG_CMPXCHG_LOCAL despite the fact that we have support for it. However setting that option will increase the size of struct page by eight bytes on 64 bit, which we certainly do not want. Also, it doesn't make sense that a present cpu feature should increase the size of struct page. Besides that it looks like the dependency to CMPXCHG_LOCAL is wrong and that it should depend on CMPXCHG_DOUBLE instead. This patch: If an architecture supports CMPXCHG_LOCAL this shouldn't result automatically in larger struct pages if the SLUB allocator is used. Instead introduce a new config option "HAVE_ALIGNED_STRUCT_PAGE" which can be selected if a double word aligned struct page is required. Also update x86 Kconfig so that it should work as before. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 8 ++++++++ arch/x86/Kconfig | 1 + include/linux/mm_types.h | 9 ++++----- mm/slub.c | 6 +++--- 4 files changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 2505740b81d2..a2c5c077c32d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -185,4 +185,12 @@ config HAVE_RCU_TABLE_FREE config ARCH_HAVE_NMI_SAFE_CMPXCHG bool +config HAVE_ALIGNED_STRUCT_PAGE + bool + help + This makes sure that struct pages are double word aligned and that + e.g. the SLUB allocator can perform double word atomic operations + on a struct page for better performance. 
However selecting this + might increase the size of a struct page by a word. + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a150f4c35e94..5201a2c27239 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select PERF_EVENTS select HAVE_PERF_EVENTS_NMI select ANON_INODES + select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b42f1b34eb7..3cc3062b3767 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -151,12 +151,11 @@ struct page { #endif } /* - * If another subsystem starts using the double word pairing for atomic - * operations on struct page then it must change the #if to ensure - * proper alignment of the page struct. + * The struct page can be forced to be double word aligned so that atomic ops + * on double words work. The SLUB allocator can make use of such a feature. 
*/ -#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL) - __attribute__((__aligned__(2*sizeof(unsigned long)))) +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE + __aligned(2 * sizeof(unsigned long)) #endif ; diff --git a/mm/slub.c b/mm/slub.c index 5d37b5e44140..72aa84134609 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -366,7 +366,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page const char *n) { VM_BUG_ON(!irqs_disabled()); -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -400,7 +400,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_new, unsigned long counters_new, const char *n) { -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -3014,7 +3014,7 @@ static int kmem_cache_open(struct kmem_cache *s, } } -#ifdef CONFIG_CMPXCHG_DOUBLE +#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; -- cgit v1.2.3 From 4156153c4daddf12dd386016f96a947a01e93bf4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:30 -0800 Subject: mm,x86,um: move CMPXCHG_LOCAL config option Move CMPXCHG_LOCAL and rename it to HAVE_CMPXCHG_LOCAL so architectures can simply select the option if it is supported. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/Kconfig.cpu | 3 --- arch/x86/um/Kconfig | 4 ---- mm/vmstat.c | 2 +- 5 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index a2c5c077c32d..22182a8cc62c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -193,4 +193,7 @@ config HAVE_ALIGNED_STRUCT_PAGE on a struct page for better performance. However selecting this might increase the size of a struct page by a word. +config HAVE_CMPXCHG_LOCAL + bool + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5201a2c27239..59717fd17bc7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select HAVE_PERF_EVENTS_NMI select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 + select HAVE_CMPXCHG_LOCAL if !M386 select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e3ca7e0d858c..99d2ab8b7795 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_LOCAL - def_bool X86_64 || (X86_32 && !M386) - config CMPXCHG_DOUBLE def_bool y diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 1d97bd84b6fb..a62bfc66239e 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,10 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_LOCAL - bool - default n - config CMPXCHG_DOUBLE bool default n diff --git a/mm/vmstat.c b/mm/vmstat.c index 8fd603b1665e..f600557a7659 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); -#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * 
If we have cmpxchg_local support then we do not need to incur the overhead * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. -- cgit v1.2.3 From 2565409fc0303f3ab8d66b8326702a687962a29b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:33 -0800 Subject: mm,x86,um: move CMPXCHG_DOUBLE config option Move CMPXCHG_DOUBLE and rename it to HAVE_CMPXCHG_DOUBLE so architectures can simply select the option if it is supported. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/Kconfig.cpu | 3 --- arch/x86/um/Kconfig | 4 ---- mm/slub.c | 9 ++++++--- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 22182a8cc62c..4f55c736be11 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,4 +196,7 @@ config HAVE_ALIGNED_STRUCT_PAGE config HAVE_CMPXCHG_LOCAL bool +config HAVE_CMPXCHG_DOUBLE + bool + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 59717fd17bc7..6c14ecd851d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 + select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 99d2ab8b7795..3c57033e2211 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_DOUBLE - def_bool y - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index a62bfc66239e..b2b54d2edf53 100644 --- 
a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,10 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_DOUBLE - bool - default n - source "arch/x86/Kconfig.cpu" endmenu diff --git a/mm/slub.c b/mm/slub.c index 72aa84134609..4907563ef7ff 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page const char *n) { VM_BUG_ON(!irqs_disabled()); -#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_new, unsigned long counters_new, const char *n) { -#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, @@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s, } } -#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; -- cgit v1.2.3 From 9512938b885304f72c847379611d6018064af840 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Thu, 12 Jan 2012 17:20:09 -0800 Subject: cpumask: update setup_node_to_cpumask_map() comments node_to_cpumask() has been replaced by cpumask_of_node(), and wholly removed since commit 29c337a0 ("cpumask: remove obsolete node_to_cpumask now everyone uses cpumask_of_node"). 
So update the comments for setup_node_to_cpumask_map(). Signed-off-by: Wanlong Gao Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/numa.c | 2 +- arch/x86/mm/numa.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 4ff3d8e411a7..3feefc3842a8 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. */ static void __init setup_node_to_cpumask_map(void) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 020cd2e80873..19d3fa08b119 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu) * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) */ void __init setup_node_to_cpumask_map(void) -- cgit v1.2.3 From a522ee85ba979e7897a75b1c97db1b0304b68b5c Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 20 Dec 2011 12:20:16 +0200 Subject: crypto: twofish-x86_64-3way - blacklist pentium4 and atom Performance of twofish-x86_64-3way on Intel Pentium 4 and Atom is lower than of twofish-x86_64 module. So blacklist these CPUs. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 7fee8c152f93..0afd134d8c9c 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -25,6 +25,7 @@ * */ +#include #include #include #include @@ -637,10 +638,56 @@ static struct crypto_alg blk_xts_alg = { }, }; +static bool is_blacklisted_cpu(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (boot_cpu_data.x86 == 0x06 && + (boot_cpu_data.x86_model == 0x1c || + boot_cpu_data.x86_model == 0x26 || + boot_cpu_data.x86_model == 0x36)) { + /* + * On Atom, twofish-3way is slower than original assembler + * implementation. Twofish-3way trades off some performance in + * storing blocks in 64bit registers to allow three blocks to + * be processed parallel. Parallel operation then allows gaining + * more performance than was trade off, on out-of-order CPUs. + * However Atom does not benefit from this parallellism and + * should be blacklisted. + */ + return true; + } + + if (boot_cpu_data.x86 == 0x0f) { + /* + * On Pentium 4, twofish-3way is slower than original assembler + * implementation because excessive uses of 64bit rotate and + * left-shifts (which are really slow on P4) needed to store and + * handle 128bit block in two 64bit registers. 
+ */ + return true; + } + + return false; +} + +static int force; +module_param(force, int, 0); +MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); + int __init init(void) { int err; + if (!force && is_blacklisted_cpu()) { + printk(KERN_INFO + "twofish-x86_64-3way: performance on this CPU " + "would be suboptimal: disabling " + "twofish-x86_64-3way.\n"); + return -ENODEV; + } + err = crypto_register_alg(&blk_ecb_alg); if (err) goto ecb_err; -- cgit v1.2.3 From 4c58464b8034cef4317593bf4ccbfc19d5bb3a77 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 20 Dec 2011 12:20:21 +0200 Subject: crypto: blowfish-x86_64 - blacklist Pentium 4 Implementation in blowfish-x86_64 uses 64bit rotations which are slow on P4, making blowfish-x86_64 slower than generic C implementation. Therefore blacklist P4. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish_glue.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index b05aa163d55a..2970110d2cea 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -25,6 +25,7 @@ * */ +#include #include #include #include @@ -446,10 +447,39 @@ static struct crypto_alg blk_ctr_alg = { }, }; +static bool is_blacklisted_cpu(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (boot_cpu_data.x86 == 0x0f) { + /* + * On Pentium 4, blowfish-x86_64 is slower than generic C + * implementation because use of 64bit rotates (which are really + * slow on P4). Therefore blacklist P4s. 
+ */ + return true; + } + + return false; +} + +static int force; +module_param(force, int, 0); +MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); + static int __init init(void) { int err; + if (!force && is_blacklisted_cpu()) { + printk(KERN_INFO + "blowfish-x86_64: performance on this CPU " + "would be suboptimal: disabling " + "blowfish-x86_64.\n"); + return -ENODEV; + } + err = crypto_register_alg(&bf_alg); if (err) goto bf_err; -- cgit v1.2.3 From 847cb7ef565d31484f426677e0bea081bfd2acd9 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 20 Dec 2011 12:58:06 +0200 Subject: crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions Matrix transpose macro in serpent-sse2 uses a mix of SSE2 integer and SSE floating point instructions, which might cause a performance penalty on some CPUs. This patch replaces the transpose_4x4 macro with a version that uses only SSE2 integer instructions. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent-sse2-i586-asm_32.S | 29 +++++++++++++--------------- arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 29 +++++++++++++--------------- 2 files changed, 26 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S index 4e37677ca851..c00053d42f99 100644 --- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -463,23 +463,20 @@ pand x0, x4; \ pxor x2, x4; -#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ - movdqa x2, t3; \ - movdqa x0, t1; \ - unpcklps x3, t3; \ +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ movdqa x0, t2; \ - unpcklps x1, t1; \ - unpckhps x1, t2; \ - movdqa t3, x1; \ - unpckhps x3, x2; \ - movdqa t1, x0; \ - movhlps t1, x1; \ - movdqa t2, t1; \ - movlhps t3, x0; \ - movlhps x2, t1; \ - movhlps t2, x2; \ - movdqa x2, x3; \ - movdqa t1, x2; + punpckldq x1, x0; \ + punpckhdq x1, t2; \ + movdqa x2, t1; \ +
punpckhdq x3, x2; \ + punpckldq x3, t1; \ + movdqa x0, x1; \ + punpcklqdq t1, x0; \ + punpckhqdq t1, x1; \ + movdqa t2, x3; \ + punpcklqdq x2, t2; \ + punpckhqdq x2, x3; \ + movdqa t2, x2; #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ movdqu (0*4*4)(in), x0; \ diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index 7f24a1540821..3ee1ff04d3e9 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -585,23 +585,20 @@ get_key(i, 1, RK1); \ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ -#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ - movdqa x2, t3; \ - movdqa x0, t1; \ - unpcklps x3, t3; \ +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ movdqa x0, t2; \ - unpcklps x1, t1; \ - unpckhps x1, t2; \ - movdqa t3, x1; \ - unpckhps x3, x2; \ - movdqa t1, x0; \ - movhlps t1, x1; \ - movdqa t2, t1; \ - movlhps t3, x0; \ - movlhps x2, t1; \ - movhlps t2, x2; \ - movdqa x2, x3; \ - movdqa t1, x2; + punpckldq x1, x0; \ + punpckhdq x1, t2; \ + movdqa x2, t1; \ + punpckhdq x3, x2; \ + punpckldq x3, t1; \ + movdqa x0, x1; \ + punpcklqdq t1, x0; \ + punpckhqdq t1, x1; \ + movdqa t2, x3; \ + punpcklqdq x2, t2; \ + punpckhqdq x2, x3; \ + movdqa t2, x2; #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ movdqu (0*4*4)(in), x0; \ -- cgit v1.2.3 From a3301b751b19f0efbafddc4034f8e7ce6bf3007b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 14 Jan 2012 08:11:31 +0530 Subject: x86/mce: Fix CPU hotplug and suspend regression related to MCE Commit 8a25a2fd126c ("cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular subsystem") changed how things are dealt with in the MCE subsystem. Some of the things that got broken due to this are CPU hotplug and suspend/hibernate. MCE uses per_cpu allocations of struct device. 
So, when a CPU goes offline and comes back online, in order to ensure that we start from a clean slate with respect to the MCE subsystem, zero out the entire per_cpu device structure before using it. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f22a9f7f6390..29ba3297e480 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2011,7 +2011,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&dev->kobj, 0, sizeof(struct kobject)); + memset(dev, 0, sizeof(struct device)); dev->id = cpu; dev->bus = &mce_subsys; -- cgit v1.2.3 From 8d973b624ece3b85cfae9474935795d034f72faf Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 15 Jan 2012 19:40:24 -0500 Subject: x86/kprobes: Fix typo transferred from Intel manual The arch/x86/lib/x86-opcode-map.txt file [used by the kprobes instruction decoder] contains the line: af: SCAS/W/D/Q rAX,Xv This is what the Intel manuals show, but it's not correct. The 'X' stands for: Memory addressed by the DS:rSI register pair (for example, MOVS, CMPS, OUTS, or LODS). On the other hand 'Y' means (also see the ae byte entry for SCASB): Memory addressed by the ES:rDI register pair (for example, MOVS, CMPS, INS, STOS, or SCAS).
Signed-off-by: Ulrich Drepper Acked-by: Masami Hiramatsu Cc: yrl.pp-manager.tt@hitachi.com Link: http://lkml.kernel.org/r/CAOPLpQfytPyDEBF1Hbkpo7ovUerEsstVGxBr%3DEpDL-BKEMaqLA@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index a793da5e560e..8641bbb8e006 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -210,7 +210,9 @@ ab: STOS/W/D/Q Yv,rAX ac: LODS/B AL,Xb ad: LODS/W/D/Q rAX,Xv ae: SCAS/B AL,Yb -af: SCAS/W/D/Q rAX,Xv +# Note: The May 2011 Intel manual shows Xv for the second parameter of the +# next instruction but Yv is correct +af: SCAS/W/D/Q rAX,Yv # 0xb0 - 0xbf b0: MOV AL/R8L,Ib b1: MOV CL/R9L,Ib -- cgit v1.2.3 From a1c611745c8c4e8996c1877d4e5d0fc95f227c38 Mon Sep 17 00:00:00 2001 From: "xiyou.wangcong@gmail.com" Date: Sun, 15 Jan 2012 20:02:17 +0800 Subject: x86/kprobes: Add arch/x86/tools/insn_sanity to .gitignore After compiling the kernel, I got: % git status # On branch master # Untracked files: # (use "git add ..." to include in what will be committed) # # arch/x86/tools/insn_sanity nothing added to commit but untracked files present (use "git add" to track) it should be added to .gitignore. 
Signed-off-by: WANG Cong Acked-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/1326628937-27609-1-git-send-email-xiyou.wangcong@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore index 028079065af6..7cab8c08e6d1 100644 --- a/arch/x86/.gitignore +++ b/arch/x86/.gitignore @@ -1,3 +1,4 @@ boot/compressed/vmlinux tools/test_get_len +tools/insn_sanity -- cgit v1.2.3 From f10448689d95b9516c656ccd4078839e656656e7 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 11 Jan 2012 05:11:46 +0400 Subject: x86: Get rid of dubious one-bit signed bitfield This very noisy sparse warning appears on almost every file in the kernel: CHECK init/main.c arch/x86/include/asm/thread_info.h:43:55: error: dubious one-bit signed bitfield arch/x86/include/asm/thread_info.h:44:46: error: dubious one-bit signed bitfield Sparse is right and this patch changes sig_on_uaccess_error and uaccess_err flags to unsigned type and thus fixes the warning. Signed-off-by: Anton Vorontsov Acked-by: Andy Lutomirski Cc: Linus Torvalds Cc: H. 
Peter Anvin Cc: Dan Carpenter Link: http://lkml.kernel.org/r/20120111011146.GA30428@oksana.dev.rtsoft.ru Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 185b719ec61a..56a63ff7665e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,8 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - int sig_on_uaccess_error:1; - int uaccess_err:1; /* uaccess failed */ + unsigned int sig_on_uaccess_error:1; + unsigned int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ -- cgit v1.2.3 From e032d80774315869aa2285b217fdbbfed86c0b49 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 16 Jan 2012 14:40:28 -0800 Subject: mce: fix warning messages about static struct mce_device When suspending, there was a large list of warnings going something like: Device 'machinecheck1' does not have a release() function, it is broken and must be fixed This patch turns the static mce_devices into dynamically allocated, and properly frees them when they are removed from the system. It solves the warning messages on my laptop here. Reported-by: "Srivatsa S. 
Bhat" Reported-by: Linus Torvalds Tested-by: Djalal Harouni Cc: Kay Sievers Cc: Tony Luck Cc: Borislav Petkov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 18 +++++++++++------- 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f35ce43c1a77..6aefb14cbbc5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct device, mce_device); +extern struct device *mce_device[CONFIG_NR_CPUS]; /* * Maximum banks number. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 29ba3297e480..5a11ae2e9e91 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct device, mce_device); +struct device *mce_device[CONFIG_NR_CPUS]; __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2001,19 +2001,27 @@ static struct device_attribute *mce_device_attrs[] = { static cpumask_var_t mce_device_initialized; +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + /* Per cpu device init. 
All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_device_create(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev; int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(dev, 0, sizeof(struct device)); + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; dev->id = cpu; dev->bus = &mce_subsys; + dev->release = &mce_device_release; err = device_register(dev); if (err) @@ -2030,6 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); + mce_device[cpu] = dev; return 0; error2: @@ -2046,7 +2055,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev = mce_device[cpu]; int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2060,6 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); + mce_device[cpu] = NULL; } /* Make sure there are no machine checks on offlined CPUs. 
*/ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ba0b94a7e204..786e76a86322 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; + struct device *dev = mce_device[cpu]; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -543,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, - b->kobj, name); + err = sysfs_create_link(&dev->kobj, b->kobj, name); if (err) goto out; @@ -565,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); + b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) goto out_free; @@ -585,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_device, i).kobj, - b->kobj, name); + dev = mce_device[i]; + if (dev) + err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) goto out; @@ -649,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { struct threshold_bank *b; + struct device *dev; char name[32]; int i = 0; @@ -663,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); + sysfs_remove_link(&mce_device[cpu]->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -675,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - 
sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); + dev = mce_device[i]; + if (dev) + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } -- cgit v1.2.3 From da87c937e5a2374686edd58df06cfd5050b125fa Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 16 Jan 2012 15:17:50 -0600 Subject: x86/UV2: Fix new UV2 hardware by using native UV2 broadcast mode Update the use of the Broadcast Assist Unit on SGI Altix UV2 to the use of native UV2 mode on new hardware (not the legacy mode). UV2 native mode has a different format for a broadcast message. We also need a quick differentiation between UV1 and UV2. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120116211750.GA5767@sgi.com Cc: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 93 +++++++++++++++++++++++++++++++++++++--- arch/x86/platform/uv/tlb_uv.c | 88 +++++++++++++++++++++++++++---------- 2 files changed, 151 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 8e862aaf0d90..4a46b27ee9a0 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -65,7 +65,7 @@ * UV2: Bit 19 selects between * (0): 10 microsecond timebase and * (1): 80 microseconds - * we're using 655us, similar to UV1: 65 units of 10us + * we're using 560us, similar to UV1: 65 units of 10us */ #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) #define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) @@ -235,10 +235,10 @@ struct bau_msg_payload { /* - * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see table 4.2.3.0.1 in broacast_assist spec.
*/ -struct bau_msg_header { +struct uv1_bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ unsigned int base_dest_nasid:15; /* nasid of the first bit */ @@ -317,20 +317,88 @@ struct bau_msg_header { /* bits 127:107 */ }; +/* + * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * see figure 9-2 of harp_sys.pdf + */ +struct uv2_bau_msg_header { + unsigned int base_dest_nasid:15; /* nasid of the first bit */ + /* bits 14:0 */ /* in uvhub map */ + unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */ + /* bits 19:15 */ + unsigned int rsvd_1:1; /* must be zero */ + /* bit 20 */ + /* Address bits 59:21 */ + /* bits 25:2 of address (44:21) are payload */ + /* these next 24 bits become bytes 12-14 of msg */ + /* bits 28:21 land in byte 12 */ + unsigned int replied_to:1; /* sent as 0 by the source to + byte 12 */ + /* bit 21 */ + unsigned int msg_type:3; /* software type of the + message */ + /* bits 24:22 */ + unsigned int canceled:1; /* message canceled, resource + is to be freed*/ + /* bit 25 */ + unsigned int payload_1:3; /* not currently used */ + /* bits 28:26 */ + + /* bits 36:29 land in byte 13 */ + unsigned int payload_2a:3; /* not currently used */ + unsigned int payload_2b:5; /* not currently used */ + /* bits 36:29 */ + + /* bits 44:37 land in byte 14 */ + unsigned int payload_3:8; /* not currently used */ + /* bits 44:37 */ + + unsigned int rsvd_2:7; /* reserved */ + /* bits 51:45 */ + unsigned int swack_flag:1; /* software acknowledge flag */ + /* bit 52 */ + unsigned int rsvd_3a:3; /* must be zero */ + unsigned int rsvd_3b:8; /* must be zero */ + unsigned int rsvd_3c:8; /* must be zero */ + unsigned int rsvd_3d:3; /* must be zero */ + /* bits 74:53 */ + unsigned int fairness:3; /* usually zero */ + /* bits 77:75 */ + + unsigned int sequence:16; /* message sequence number */ + /* bits 93:78 Suppl_A */ + unsigned int chaining:1; /* next descriptor is part of + this activation*/ + /* bit 
94 */ + unsigned int multilevel:1; /* multi-level multicast + format */ + /* bit 95 */ + unsigned int rsvd_4:24; /* ordered / source node / + source subnode / aging + must be zero */ + /* bits 119:96 */ + unsigned int command:8; /* message type */ + /* bits 127:120 */ +}; + /* * The activation descriptor: * The format of the message to send, plus all accompanying control * Should be 64 bytes */ struct bau_desc { - struct pnmask distribution; + struct pnmask distribution; /* * message template, consisting of header and payload: */ - struct bau_msg_header header; - struct bau_msg_payload payload; + union bau_msg_header { + struct uv1_bau_msg_header uv1_hdr; + struct uv2_bau_msg_header uv2_hdr; + } header; + + struct bau_msg_payload payload; }; -/* +/* UV1: * -payload-- ---------header------ * bytes 0-11 bits 41-56 bits 58-81 * A B (2) C (3) @@ -340,6 +408,16 @@ struct bau_desc { * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) * ------------payload queue----------- */ +/* UV2: + * -payload-- ---------header------ + * bytes 0-11 bits 70-78 bits 21-44 + * A B (2) C (3) + * + * A/B/C are moved to: + * A C B + * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) + * ------------payload queue----------- + */ /* * The payload queue on the destination side is an array of these. 
@@ -511,6 +589,7 @@ struct bau_control { short osnode; short uvhub_cpu; short uvhub; + short uvhub_version; short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 5b552198f774..1341a2e06542 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -573,7 +573,7 @@ static int wait_completion(struct bau_desc *bau_desc, right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } - if (is_uv1_hub()) + if (bcp->uvhub_version == 1) return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); else @@ -757,15 +757,22 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, { int seq_number = 0; int completion_stat = 0; + int uv1 = 0; long try = 0; unsigned long index; cycles_t time1; cycles_t time2; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster = bcp->uvhub_master; + struct uv1_bau_msg_header *uv1_hdr = NULL; + struct uv2_bau_msg_header *uv2_hdr = NULL; - if (is_uv1_hub()) + if (bcp->uvhub_version == 1) { + uv1 = 1; uv1_throttle(hmaster, stat); + uv1_hdr = &bau_desc->header.uv1_hdr; + } else + uv2_hdr = &bau_desc->header.uv2_hdr; while (hmaster->uvhub_quiesce) cpu_relax(); @@ -773,14 +780,23 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, time1 = get_cycles(); do { if (try == 0) { - bau_desc->header.msg_type = MSG_REGULAR; + if (uv1) + uv1_hdr->msg_type = MSG_REGULAR; + else + uv2_hdr->msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - bau_desc->header.msg_type = MSG_RETRY; + if (uv1) + uv1_hdr->msg_type = MSG_RETRY; + else + uv2_hdr->msg_type = MSG_RETRY; stat->s_retry_messages++; } - bau_desc->header.sequence = seq_number; + if (uv1) + uv1_hdr->sequence = seq_number; + else + uv2_hdr->sequence = seq_number; index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); @@ -967,7 +983,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 
stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu; + bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -1083,7 +1099,7 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image |= (1L << UV2_LEG_SHFT); + mmr_image &= ~(1L << UV2_LEG_SHFT); mmr_image |= (1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); @@ -1432,12 +1448,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; + int uv1 = 0; unsigned long gpa; unsigned long m; unsigned long n; size_t dsize; struct bau_desc *bau_desc; struct bau_desc *bd2; + struct uv1_bau_msg_header *uv1_hdr; + struct uv2_bau_msg_header *uv2_hdr; struct bau_control *bcp; /* @@ -1451,6 +1470,8 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) gpa = uv_gpa(bau_desc); n = uv_gpa_to_gnode(gpa); m = uv_gpa_to_offset(gpa); + if (is_uv1_hub()) + uv1 = 1; /* the 14-bit pnode */ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); @@ -1461,21 +1482,33 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) */ for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { memset(bd2, 0, sizeof(struct bau_desc)); - bd2->header.swack_flag = 1; - /* - * The base_dest_nasid set in the message header is the nasid - * of the first uvhub in the partition. The bit map will - * indicate destination pnode numbers relative to that base. - * They may not be consecutive if nasid striding is being used. 
- */ - bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); - bd2->header.dest_subnodeid = UV_LB_SUBNODEID; - bd2->header.command = UV_NET_ENDPOINT_INTD; - bd2->header.int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ + if (uv1) { + uv1_hdr = &bd2->header.uv1_hdr; + uv1_hdr->swack_flag = 1; + /* + * The base_dest_nasid set in the message header + * is the nasid of the first uvhub in the partition. + * The bit map will indicate destination pnode numbers + * relative to that base. They may not be consecutive + * if nasid striding is being used. + */ + uv1_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv1_hdr->command = UV_NET_ENDPOINT_INTD; + uv1_hdr->int_both = 1; + /* + * all others need to be set to zero: + * fairness chaining multilevel count replied_to + */ + } else { + uv2_hdr = &bd2->header.uv2_hdr; + uv2_hdr->swack_flag = 1; + uv2_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_hdr->command = UV_NET_ENDPOINT_INTD; + } } for_each_present_cpu(cpu) { if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) @@ -1728,6 +1761,14 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = *smasterp; bcp->uvhub = bdp->uvhub; + if (is_uv1_hub()) + bcp->uvhub_version = 1; + else if (is_uv2_hub()) + bcp->uvhub_version = 2; + else { + printk(KERN_EMERG "uvhub version not 1 or 2\n"); + return 1; + } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { @@ -1867,7 +1908,8 @@ static int __init uv_bau_init(void) val = 1L << 63; write_gmmr_activation(pnode, val); mmr = 1; /* should be 1 to broadcast to both sockets */ - write_mmr_data_broadcast(pnode, mmr); + if (!is_uv1_hub()) + write_mmr_data_broadcast(pnode, mmr); } } -- cgit v1.2.3 From 
d059f9fa84a30e04279c6ff615e9e2cf3b260191 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 16 Jan 2012 15:18:48 -0600 Subject: x86/UV2: Fix BAU destination timeout initialization Move the call to enable_timeouts() forward so that BAU_MISC_CONTROL is initialized before using it in calculate_destination_timeout(). Fix the calculation of a BAU destination timeout for UV2 (in calculate_destination_timeout()). Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120116211848.GB5767@sgi.com Cc: Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 1341a2e06542..c425ff1a9cc3 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1617,14 +1617,14 @@ static int calculate_destination_timeout(void) ts_ns = base * mult1 * mult2; ret = ts_ns / 1000; } else { - /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */ - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ + mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) - mult1 = 80; + base = 80; else - mult1 = 10; - base = mmr_image & UV2_ACK_MASK; + base = 10; + mult1 = mmr_image & UV2_ACK_MASK; ret = mult1 * base; } return ret; @@ -1886,6 +1886,8 @@ static int __init uv_bau_init(void) uv_base_pnode = uv_blade_to_pnode(uvhub); } + enable_timeouts(); + if (init_per_cpu(nuvhubs, uv_base_pnode)) { nobau = 1; return 0; @@ -1896,7 +1898,6 @@ static int __init uv_bau_init(void) if (uv_blade_nr_possible_cpus(uvhub)) init_uvhub(uvhub, vector, uv_base_pnode); - enable_timeouts(); alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { -- cgit v1.2.3 From c5d35d399e685acccc85a675e8765c26b2a9813a Mon Sep 17 00:00:00 2001 From: Cliff Wickman 
Date: Mon, 16 Jan 2012 15:19:47 -0600 Subject: x86/UV2: Work around BAU bug This patch implements a workaround for a UV2 hardware bug. The bug is a non-atomic update of a memory-mapped register. When hardware message delivery and software message acknowledge occur simultaneously, the pending message acknowledge for the arriving message may be lost. This causes the sender's message status to stay busy. Part of the workaround is to not acknowledge a completed message until it is verified that no other message is actually using the resource that is mistakenly recorded in the completed message. Part of the workaround is to test for a long elapsed time in such a busy condition, then handle it by using a spare sending descriptor. The stay-busy condition is eventually timed out by hardware, and then the original sending descriptor can be re-used. Most of that logic change is in keeping track of the current descriptor and the state of the spares. The occurrences of the workaround are added to the BAU statistics. 
Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120116211947.GC5767@sgi.com Cc: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 13 +- arch/x86/platform/uv/tlb_uv.c | 274 ++++++++++++++++++++++++++++++++++----- 2 files changed, 254 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 4a46b27ee9a0..1b82f7e87393 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -167,6 +167,7 @@ #define FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 +#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed @@ -463,7 +464,6 @@ struct bau_pq_entry { struct msg_desc { struct bau_pq_entry *msg; int msg_slot; - int swack_slot; struct bau_pq_entry *queue_first; struct bau_pq_entry *queue_last; }; @@ -517,6 +517,9 @@ struct ptc_stats { unsigned long s_retry_messages; /* retry broadcasts */ unsigned long s_bau_reenabled; /* for bau enable/disable */ unsigned long s_bau_disabled; /* for bau enable/disable */ + unsigned long s_uv2_wars; /* uv2 workaround, perm. 
busy */ + unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ + unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -593,6 +596,8 @@ struct bau_control { short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; + short using_desc; /* an index, like uvhub_cpu */ + unsigned int inuse_map; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -610,6 +615,7 @@ struct bau_control { int cong_response_us; int cong_reps; int cong_period; + unsigned long clocks_per_100_usec; cycles_t period_time; long period_requests; struct hub_and_pnode *thp; @@ -670,6 +676,11 @@ static inline void write_mmr_sw_ack(unsigned long mr) uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); } +static inline void write_gmmr_sw_ack(int pnode, unsigned long mr) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + static inline unsigned long read_mmr_sw_ack(void) { return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index c425ff1a9cc3..9010ca715c03 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub) * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). 
*/ -static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { unsigned long dw; struct bau_pq_entry *msg; msg = mdp->msg; - if (!msg->canceled) { + if (!msg->canceled && do_acknowledge) { dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; write_mmr_sw_ack(dw); } @@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { unsigned long mr; /* - * is the resource timed out? - * make everyone ignore the cancelled message. + * Is the resource timed out? + * Make everyone ignore the cancelled message. */ msg2->canceled = 1; stat->d_canceled++; @@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, * Do all the things a cpu should do for a TLB shootdown message. * Other cpu's may come here at the same time for this message. */ -static void bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) +static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { short socket_ack_count = 0; short *sp; @@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp, if (msg_ack_count == bcp->cpus_in_uvhub) { /* * All cpus in uvhub saw it; reply + * (unless we are in the UV2 workaround) */ - reply_to_message(mdp, bcp); + reply_to_message(mdp, bcp, do_acknowledge); } } @@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, /* * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. 
*/ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu) +static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; unsigned long descriptor_status2; descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL; + descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; descriptor_status = (descriptor_status << 1) | descriptor_status2; return descriptor_status; } +/* + * Return whether the status of the descriptor that is normally used for this + * cpu (the one indexed by its hub-relative cpu number) is busy. + * The status of the original 32 descriptors is always reflected in the 64 + * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. + * The bit provided by the activation_status_2 register is irrelevant to + * the status if it is only being tested for busy or not busy. + */ +int normal_busy(struct bau_control *bcp) +{ + int cpu = bcp->uvhub_cpu; + int mmr_offset; + int right_shift; + + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + return (((((read_lmmr(mmr_offset) >> right_shift) & + UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); +} + +/* + * Entered when a bau descriptor has gone into a permanent busy wait because + * of a hardware bug. + * Workaround the bug. 
+ */ +int handle_uv2_busy(struct bau_control *bcp) +{ + int busy_one = bcp->using_desc; + int normal = bcp->uvhub_cpu; + int selected = -1; + int i; + unsigned long descriptor_status; + unsigned long status; + int mmr_offset; + struct bau_desc *bau_desc_old; + struct bau_desc *bau_desc_new; + struct bau_control *hmaster = bcp->uvhub_master; + struct ptc_stats *stat = bcp->statp; + cycles_t ttm; + + stat->s_uv2_wars++; + spin_lock(&hmaster->uvhub_lock); + /* try for the original first */ + if (busy_one != normal) { + if (!normal_busy(bcp)) + selected = normal; + } + if (selected < 0) { + /* can't use the normal, select an alternate */ + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + descriptor_status = read_lmmr(mmr_offset); + + /* scan available descriptors 32-63 */ + for (i = 0; i < UV_CPUS_PER_AS; i++) { + if ((hmaster->inuse_map & (1 << i)) == 0) { + status = ((descriptor_status >> + (i * UV_ACT_STATUS_SIZE)) & + UV_ACT_STATUS_MASK) << 1; + if (status != UV2H_DESC_BUSY) { + selected = i + UV_CPUS_PER_AS; + break; + } + } + } + } + + if (busy_one != normal) + /* mark the busy alternate as not in-use */ + hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); + + if (selected >= 0) { + /* switch to the selected descriptor */ + if (selected != normal) { + /* set the selected alternate as in-use */ + hmaster->inuse_map |= + (1 << (selected - UV_CPUS_PER_AS)); + if (selected > stat->s_uv2_wars_hw) + stat->s_uv2_wars_hw = selected; + } + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * busy_one); + bcp->using_desc = selected; + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * selected); + *bau_desc_new = *bau_desc_old; + } else { + /* + * All are busy. Wait for the normal one for this cpu to + * free up. 
+ */ + stat->s_uv2_war_waits++; + spin_unlock(&hmaster->uvhub_lock); + ttm = get_cycles(); + do { + cpu_relax(); + } while (normal_busy(bcp)); + spin_lock(&hmaster->uvhub_lock); + /* switch to the original descriptor */ + bcp->using_desc = normal; + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); + bcp->using_desc = (ITEMS_PER_DESC * normal); + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * normal); + *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ + } + spin_unlock(&hmaster->uvhub_lock); + return FLUSH_RETRY_BUSYBUG; +} + static int uv2_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { unsigned long descriptor_stat; cycles_t ttm; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; + long busy_reps = 0; struct ptc_stats *stat = bcp->statp; - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { @@ -542,12 +655,23 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, bcp->conseccompletes = 0; return FLUSH_RETRY_TIMEOUT; } else { + busy_reps++; + if (busy_reps > 1000000) { + /* not to hammer on the clock */ + busy_reps = 0; + ttm = get_cycles(); + if ((ttm - bcp->send_message) > + (bcp->clocks_per_100_usec)) { + return handle_uv2_busy(bcp); + } + } /* * descriptor_stat is still BUSY */ cpu_relax(); } - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, + desc); } bcp->conseccompletes++; return FLUSH_COMPLETE; @@ -563,14 +687,14 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; - if (cpu < UV_CPUS_PER_AS) { + if (desc < UV_CPUS_PER_AS) { 
mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; + right_shift = desc * UV_ACT_STATUS_SIZE; } else { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); + right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } if (bcp->uvhub_version == 1) @@ -752,8 +876,7 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) { int seq_number = 0; int completion_stat = 0; @@ -766,20 +889,24 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_bau_msg_header *uv2_hdr = NULL; + struct bau_desc *bau_desc; - if (bcp->uvhub_version == 1) { - uv1 = 1; + if (bcp->uvhub_version == 1) uv1_throttle(hmaster, stat); - uv1_hdr = &bau_desc->header.uv1_hdr; - } else - uv2_hdr = &bau_desc->header.uv2_hdr; while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); do { - if (try == 0) { + bau_desc = bcp->descriptor_base; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + if (bcp->uvhub_version == 1) { + uv1 = 1; + uv1_hdr = &bau_desc->header.uv1_hdr; + } else + uv2_hdr = &bau_desc->header.uv2_hdr; + if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else @@ -797,13 +924,14 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, uv1_hdr->sequence = seq_number; else uv2_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; + index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, 
try); + /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); @@ -814,6 +942,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || + (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -828,6 +957,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, record_send_stats(time1, time2, bcp, stat, completion_stat, try); if (completion_stat == FLUSH_GIVEUP) + /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ return 1; return 0; } @@ -983,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -996,12 +1126,85 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp)) return NULL; else return cpumask; } +/* + * Search the message queue for any 'other' message with the same software + * acknowledge resource bit vector. 
+ */ +struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp, unsigned char swack_vec) +{ + struct bau_pq_entry *msg_next = msg + 1; + + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + while ((msg_next->swack_vec != 0) && (msg_next != msg)) { + if (msg_next->swack_vec == swack_vec) + return msg_next; + msg_next++; + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + } + return NULL; +} + +/* + * UV2 needs to work around a bug in which an arriving message has not + * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. + * Such a message must be ignored. + */ +void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +{ + unsigned long mmr_image; + unsigned char swack_vec; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *other_msg; + + mmr_image = read_mmr_sw_ack(); + swack_vec = msg->swack_vec; + + if ((swack_vec & mmr_image) == 0) { + /* + * This message was assigned a swack resource, but no + * reserved acknowlegment is pending. + * The bug has prevented this message from setting the MMR. + * And no other message has used the same sw_ack resource. + * Do the requested shootdown but do not reply to the msg. + * (the 0 means make no acknowledge) + */ + bau_process_message(mdp, bcp, 0); + return; + } + + /* + * Some message has set the MMR 'pending' bit; it might have been + * another message. Look for that message. + */ + other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); + if (other_msg) { + /* There is another. Do not ack the current one. */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that message acknowledge + * it. Don't get the processing of sw_ack's out of order. + */ + return; + } + + /* + * There is no other message using this sw_ack, so it is safe to + * acknowledge it. + */ + bau_process_message(mdp, bcp, 1); + + return; +} + /* * The BAU message interrupt comes here. 
(registered by set_intr_gate) * See entry_64.S @@ -1038,9 +1241,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs) count++; msgdesc.msg_slot = msg - msgdesc.queue_first; - msgdesc.swack_slot = ffs(msg->swack_vec) - 1; msgdesc.msg = msg; - bau_process_message(&msgdesc, bcp); + if (bcp->uvhub_version == 2) + process_uv2_message(&msgdesc, bcp); + else + bau_process_message(&msgdesc, bcp, 1); msg++; if (msg > msgdesc.queue_last) @@ -1158,7 +1363,7 @@ static int ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "all one mult none retry canc nocan reset rcan "); seq_printf(file, - "disable enable\n"); + "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -1189,8 +1394,10 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); + seq_printf(file, "%ld %ld %ld %ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits); } return 0; } @@ -1564,6 +1771,7 @@ static void pq_init(int node, int pnode) write_mmr_payload_first(pnode, pn_first); write_mmr_payload_tail(pnode, first); write_mmr_payload_last(pnode, last); + write_gmmr_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); @@ -1651,6 +1859,7 @@ static void __init init_per_cpu_tunables(void) bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; bcp->cong_period = congested_period; + bcp->clocks_per_100_usec = usec_2_cycles(100); } } @@ -1771,6 +1980,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu 
>= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); -- cgit v1.2.3 From 478c6e529e7bd7c6ef8994c55bd252c287c35893 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 16 Jan 2012 15:20:50 -0600 Subject: x86/UV2: Remove stale no-resources test for UV2 BAU This patch removes an unnecessary test for a no-destination-resources-available condition that looks like a destination timeout in UV1, but is separately distinguishable in UV2. Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120116212050.GD5767@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 9010ca715c03..affea509c174 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -642,16 +642,6 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { stat->s_dtimeout++; ttm = get_cycles(); - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. - */ - if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - } bcp->conseccompletes = 0; return FLUSH_RETRY_TIMEOUT; } else { -- cgit v1.2.3 From 88ed9dd7f63c3ae71c1984d99ee2dced0b386dea Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 16 Jan 2012 15:21:46 -0600 Subject: x86/UV2: Ack BAU interrupt earlier This patch moves the ack of the BAU interrupt to the beginning of the interrupt handler so that there is less possibility of a lost interrupt and slower response to a shootdown message. 
Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120116212146.GE5767@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index affea509c174..4686bf1e56ec 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1218,6 +1218,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) struct ptc_stats *stat; struct msg_desc msgdesc; + ack_APIC_irq(); time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); @@ -1247,8 +1248,6 @@ void uv_bau_message_interrupt(struct pt_regs *regs) stat->d_nomsg++; else if (count > 1) stat->d_multmsg++; - - ack_APIC_irq(); } /* -- cgit v1.2.3 From b54bd9be35f4084edb3eb9ee054a43f722a67483 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 16 Jan 2012 15:22:38 -0600 Subject: x86/UV2: Add accounting for BAU strong nacks This patch adds separate accounting of UV2 message "strong nack's" in the BAU statistics. 
Signed-off-by: Cliff Wickman Link: http://lkml.kernel.org/r/20120116212238.GF5767@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 + arch/x86/platform/uv/tlb_uv.c | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 1b82f7e87393..becf47b81735 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -483,6 +483,7 @@ struct ptc_stats { requests */ unsigned long s_stimeout; /* source side timeouts */ unsigned long s_dtimeout; /* destination side timeouts */ + unsigned long s_strongnacks; /* number of strong nack's */ unsigned long s_time; /* time spent in sending side */ unsigned long s_retriesok; /* successful retries */ unsigned long s_ntargcpu; /* total number of cpu's diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 4686bf1e56ec..9be4cff00a2d 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -635,13 +635,15 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, * our message and its state will stay IDLE. 
*/ if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || - (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) || (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { stat->s_stimeout++; return FLUSH_GIVEUP; + } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { + stat->s_strongnacks++; + bcp->conseccompletes = 0; + return FLUSH_GIVEUP; } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { stat->s_dtimeout++; - ttm = get_cycles(); bcp->conseccompletes = 0; return FLUSH_RETRY_TIMEOUT; } else { @@ -1346,7 +1348,7 @@ static int ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); seq_printf(file, "resetp resett giveup sto bz throt swack recv rtime "); seq_printf(file, @@ -1364,10 +1366,10 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_ntargremotes, stat->s_ntargcpu, stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, stat->s_ntarguvhub, stat->s_ntarguvhub16); - seq_printf(file, "%ld %ld %ld %ld %ld ", + seq_printf(file, "%ld %ld %ld %ld %ld %ld ", stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_dtimeout); + stat->s_dtimeout, stat->s_strongnacks); seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, -- cgit v1.2.3 From b54ac6d2a25084667da781c7ca2cebef52a2bcdd Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 8 Dec 2011 11:25:49 +0800 Subject: ACPI, Record ACPI NVS regions Some firmware will access memory in ACPI NVS region via APEI. That is, instructions in APEI ERST/EINJ table will read/write ACPI NVS region. The original resource conflict checking in APEI code will check memory/ioport accessed by APEI via general resource management mechanism. 
But ACPI NVS region is marked as busy already, so that the false resource conflict will prevent APEI ERST/EINJ to work. To fix this, this patch record ACPI NVS regions, so that we can avoid request resources for memory region inside it. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- arch/x86/kernel/e820.c | 4 ++-- drivers/acpi/Makefile | 3 ++- drivers/acpi/nvs.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/acpi.h | 20 +++++++++++++------ 4 files changed, 70 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e48f076..51c3b186e5b9 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -714,7 +714,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_ACPI /** * Mark ACPI NVS memory region, so that we can save/restore it during * hibernation and the subsequent resume. @@ -727,7 +727,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - suspend_nvs_register(ei->addr, ei->size); + acpi_nvs_register(ei->addr, ei->size); } return 0; diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index ecb26b4f29a0..c07f44f05f9d 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -20,11 +20,12 @@ obj-y += acpi.o \ # All the builtin files are in the "acpi." module_param namespace. 
acpi-y += osl.o utils.o reboot.o acpi-y += atomicio.o +acpi-y += nvs.o # sleep related files acpi-y += wakeup.o acpi-y += sleep.o -acpi-$(CONFIG_ACPI_SLEEP) += proc.o nvs.o +acpi-$(CONFIG_ACPI_SLEEP) += proc.o # diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c index 096787b43c96..7a2035fa8c71 100644 --- a/drivers/acpi/nvs.c +++ b/drivers/acpi/nvs.c @@ -15,6 +15,56 @@ #include #include +/* ACPI NVS regions, APEI may use it */ + +struct nvs_region { + __u64 phys_start; + __u64 size; + struct list_head node; +}; + +static LIST_HEAD(nvs_region_list); + +#ifdef CONFIG_ACPI_SLEEP +static int suspend_nvs_register(unsigned long start, unsigned long size); +#else +static inline int suspend_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +#endif + +int acpi_nvs_register(__u64 start, __u64 size) +{ + struct nvs_region *region; + + region = kmalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return -ENOMEM; + region->phys_start = start; + region->size = size; + list_add_tail(&region->node, &nvs_region_list); + + return suspend_nvs_register(start, size); +} + +int acpi_nvs_for_each_region(int (*func)(__u64 start, __u64 size, void *data), + void *data) +{ + int rc; + struct nvs_region *region; + + list_for_each_entry(region, &nvs_region_list, node) { + rc = func(region->phys_start, region->size, data); + if (rc) + return rc; + } + + return 0; +} + + +#ifdef CONFIG_ACPI_SLEEP /* * Platforms, like ACPI, may want us to save some memory used by them during * suspend and to restore the contents of this memory during the subsequent @@ -41,7 +91,7 @@ static LIST_HEAD(nvs_list); * things so that the data from page-aligned addresses in this region will * be copied into separate RAM pages.
*/ -int suspend_nvs_register(unsigned long start, unsigned long size) +static int suspend_nvs_register(unsigned long start, unsigned long size) { struct nvs_page *entry, *next; @@ -159,3 +209,4 @@ void suspend_nvs_restore(void) if (entry->data) memcpy(entry->kaddr, entry->data, entry->size); } +#endif diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6001b4da39dd..26b75442ff7a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -306,6 +306,11 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req); extern void acpi_early_init(void); +extern int acpi_nvs_register(__u64 start, __u64 size); + +extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), + void *data); + #else /* !CONFIG_ACPI */ #define acpi_disabled 1 @@ -348,15 +353,18 @@ static inline int acpi_table_parse(char *id, { return -1; } -#endif /* !CONFIG_ACPI */ -#ifdef CONFIG_ACPI_SLEEP -int suspend_nvs_register(unsigned long start, unsigned long size); -#else -static inline int suspend_nvs_register(unsigned long a, unsigned long b) +static inline int acpi_nvs_register(__u64 start, __u64 size) { return 0; } -#endif + +static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), + void *data) +{ + return 0; +} + +#endif /* !CONFIG_ACPI */ #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3 From cd298f60a2451a16e0f077404bf69b62ec868733 Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Tue, 17 Jan 2012 04:20:31 -0500 Subject: ACPI, x86: Use SRAT table rev to use 8bit or 32bit PXM fields (x86/x86-64) In SRAT v1, we had 8bit proximity domain (PXM) fields; SRAT v2 provides 32bits for these. The new fields were reserved before. According to the ACPI spec, the OS must disregard reserved fields. x86/x86-64 was rather inconsistent prior to this patch; it used 8 bits for the pxm field in cpu_affinity, but 32 bits in mem_affinity.
This patch makes it consistent: Either use 8 bits consistently (SRAT rev 1 or lower) or 32 bits (SRAT rev 2 or higher). cc: x86@kernel.org Signed-off-by: Kurt Garloff Signed-off-by: Len Brown --- arch/x86/mm/srat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 81dbfdeb080d..7efd0c615d58 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -104,6 +104,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain_lo; + if (acpi_srat_revision >= 2) + pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); @@ -155,6 +157,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; + if (acpi_srat_revision <= 1) + pxm &= 0xff; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); -- cgit v1.2.3 From 5ee71535440f034de1196b11f78cef81c4025c2b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Jan 2012 11:57:18 -0800 Subject: x86/kconfig: Move the ZONE_DMA entry under a menu Move the ZONE_DMA kconfig symbol under a menu item instead of having it listed before everything else in "make {xconfig | gconfig | nconfig | menuconfig}". This drops the first line of the top-level kernel config menu (in 3.2) below and moves it under "Processor type and features". [*] DMA memory allocation support General setup ---> [*] Enable loadable module support ---> [*] Enable the block layer ---> Processor type and features ---> Power management and ACPI options ---> Bus options (PCI etc.) 
---> Executable file formats / Emulations ---> Signed-off-by: Randy Dunlap Acked-by: David Rientjes Cc: Linus Torvalds Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/4F14811E.6090107@xenotime.net Signed-off-by: Ingo Molnar Cc: David Rientjes --- arch/x86/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5731eb70e0a0..db190faffba1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -120,16 +120,6 @@ config HAVE_LATENCYTOP_SUPPORT config MMU def_bool y -config ZONE_DMA - bool "DMA memory allocation support" if EXPERT - default y - help - DMA memory allocation support allows devices with less than 32-bit - addressing to allocate within the first 16MB of address space. - Disable if no such devices will be used. - - If unsure, say Y. - config SBUS bool @@ -253,6 +243,16 @@ source "kernel/Kconfig.freezer" menu "Processor type and features" +config ZONE_DMA + bool "DMA memory allocation support" if EXPERT + default y + help + DMA memory allocation support allows devices with less than 32-bit + addressing to allocate within the first 16MB of address space. + Disable if no such devices will be used. + + If unsure, say Y. + source "kernel/time/Kconfig" config SMP -- cgit v1.2.3 From ce79dac861e0d9a473d9923391bdbaad83c1c57f Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 17 Jan 2012 14:14:02 -0500 Subject: x86, opcode: ANDN and Group 17 in x86-opcode-map.txt The Intel documentation at http://software.intel.com/file/36945 shows the ANDN opcode and Group 17 with encoding f2 and f3 encoding respectively. The current version of x86-opcode-map.txt shows them with f3 and f4. Unless someone can point to documentation which shows the currently used encoding the following patch be applied. Signed-off-by: Ulrich Drepper Link: http://lkml.kernel.org/r/CAOPLpQdq5SuVo9=023CYhbFLAX9rONyjmYq7jJkqc5xwctW5eA@mail.gmail.com Signed-off-by: H. 
Peter Anvin --- arch/x86/lib/x86-opcode-map.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5b83c51c12e0..4c8010d4f5e6 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -729,8 +729,8 @@ de: VAESDEC Vdq,Hdq,Wdq (66),(v1) df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) -f3: ANDN Gy,By,Ey (v) -f4: Grp17 (1A) +f2: ANDN Gy,By,Ey (v) +f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) f6: MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) -- cgit v1.2.3 From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: Audit: push audit success and retcode into arch ptrace.h The audit system previously expected arches calling to audit_syscall_exit to supply as arguments if the syscall was a success and what the return code was. Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things by converting from negative retcodes to an audit internal magic value stating success or failure. This helper was wrong and could indicate that a valid pointer returned to userspace was a failed syscall. The fix is to fix the layering foolishness. We now pass audit_syscall_exit a struct pt_reg and it in turns calls back into arch code to collect the return value and to determine if the syscall was a success or failure. We also define a generic is_syscall_success() macro which determines success/failure based on if the value is < -MAX_ERRNO. This works for arches like x86 which do not use a separate mechanism to indicate syscall failure. We make both the is_syscall_success() and regs_return_value() static inlines instead of macros. 
The reason is because the audit function must take a void* for the regs. (uml calls theirs struct uml_pt_regs instead of just struct pt_regs so audit_syscall_exit can't take a struct pt_regs). Since the audit function takes a void* we need to use static inlines to cast it back to the arch correct structure to dereference it. The other major change is that on some arches, like ia64, MIPS and ppc, we change regs_return_value() to give us the negative value on syscall failure. THE only other user of this macro, kretprobe_example.c, won't notice and it makes the value signed consistently for the audit functions across all archs. In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old audit code as the return value. But the ptrace_64.h code defined the macro regs_return_value() as regs[3]. I have no idea which one is correct, but this patch now uses the regs_return_value() function, so it now uses regs[3]. For powerpc we previously used regs->result but now use the regs_return_value() function which uses regs->gprs[3]. regs->gprs[3] is always positive so the regs_return_value(), much like ia64 makes it negative before calling the audit code when appropriate. Signed-off-by: Eric Paris Acked-by: H. Peter Anvin [for x86 portion] Acked-by: Tony Luck [for ia64] Acked-by: Richard Weinberger [for uml] Acked-by: David S. 
Miller [for sparc] Acked-by: Ralf Baechle [for mips] Acked-by: Benjamin Herrenschmidt [for ppc] --- arch/ia64/include/asm/ptrace.h | 13 ++++++++++++- arch/ia64/kernel/ptrace.c | 9 +-------- arch/microblaze/include/asm/ptrace.h | 5 +++++ arch/microblaze/kernel/ptrace.c | 3 +-- arch/mips/include/asm/ptrace.h | 14 +++++++++++++- arch/mips/kernel/ptrace.c | 4 +--- arch/powerpc/include/asm/ptrace.h | 13 ++++++++++++- arch/powerpc/kernel/ptrace.c | 4 +--- arch/s390/include/asm/ptrace.h | 6 +++++- arch/s390/kernel/ptrace.c | 4 +--- arch/sh/include/asm/ptrace_32.h | 5 ++++- arch/sh/include/asm/ptrace_64.h | 5 ++++- arch/sh/kernel/ptrace_32.c | 4 +--- arch/sh/kernel/ptrace_64.c | 4 +--- arch/sparc/include/asm/ptrace.h | 10 +++++++++- arch/sparc/kernel/ptrace_64.c | 11 +---------- arch/um/kernel/ptrace.c | 4 ++-- arch/x86/ia32/ia32entry.S | 10 +++++----- arch/x86/kernel/entry_32.S | 8 ++++---- arch/x86/kernel/entry_64.S | 10 +++++----- arch/x86/kernel/ptrace.c | 3 +-- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/um/shared/sysdep/ptrace.h | 5 +++++ include/linux/audit.h | 22 ++++++++++++++-------- include/linux/ptrace.h | 10 ++++++++++ kernel/auditsc.c | 16 ++++++++++++---- 26 files changed, 132 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index f5cb27614e35..68c98f5b3ca6 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->ar_bspstore; } -#define regs_return_value(regs) ((regs)->r8) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return regs->r10 != -1; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->r8; + else + return -regs->r8; +} /* Conserve space in histogram by encoding slot bits in address * bits 2 and 3 rather than bits 0 and 1. 
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 8848f43d819e..2c154088cce7 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1268,14 +1268,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, { int step; - if (unlikely(current->audit_context)) { - int success = AUDITSC_RESULT(regs.r10); - long result = regs.r8; - - if (success != AUDITSC_SUCCESS) - result = -result; - audit_syscall_exit(success, result); - } + audit_syscall_exit(&regs); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h index 816bee64b196..94e92c805859 100644 --- a/arch/microblaze/include/asm/ptrace.h +++ b/arch/microblaze/include/asm/ptrace.h @@ -61,6 +61,11 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->pc) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->r3; +} + #else /* __KERNEL__ */ /* pt_regs offsets used by gdbserver etc in ptrace syscalls */ diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index 043cb58f9c44..f564b1bfd386 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -159,8 +159,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3); + audit_syscall_exit(regs); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index de39b1f343ea..7d409505df2d 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child, */ #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER) -#define regs_return_value(_regs)
((_regs)->regs[2]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !regs->regs[7]; +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->regs[2]; + else + return -regs->regs[2]; +} + #define instruction_pointer(regs) ((regs)->cp0_epc) #define profile_pc(regs) instruction_pointer(regs) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 4e6ea1ffad46..ab0f1963a7bd 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -572,9 +572,7 @@ out: */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), - -regs->regs[2]); + audit_syscall_exit(regs); if (!(current->ptrace & PT_PTRACED)) return; diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 48223f9b8728..78a205162fd7 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -86,7 +86,18 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) #define kernel_stack_pointer(regs) ((regs)->gpr[1]) -#define regs_return_value(regs) ((regs)->gpr[3]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->ccr & 0x10000000); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->gpr[3]; + else + return -regs->gpr[3]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 5de73dbd15c7..09d31c12a5e3 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1748,9 +1748,7 @@ void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, - regs->result); + 
audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->result); diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 56da355678f4..aeb77f017985 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -541,9 +541,13 @@ struct user_regs_struct #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) -#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->gprs[2]; +} + int regs_query_register_offset(const char *name); const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 573bc29551ef..f52758600980 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -751,9 +751,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), - regs->gprs[2]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->gprs[2]); diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h index 6c2239cca1a2..2d3e906aa722 100644 --- a/arch/sh/include/asm/ptrace_32.h +++ b/arch/sh/include/asm/ptrace_32.h @@ -76,7 +76,10 @@ struct pt_dspregs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tra) -#define regs_return_value(_regs) ((_regs)->regs[0]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[0]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/ptrace_64.h 
b/arch/sh/include/asm/ptrace_64.h index bf9be7764d69..eb3fcceaf64b 100644 --- a/arch/sh/include/asm/ptrace_64.h +++ b/arch/sh/include/asm/ptrace_64.h @@ -13,7 +13,10 @@ struct pt_regs { #ifdef __KERNEL__ #define MAX_REG_OFFSET offsetof(struct pt_regs, tregs[7]) -#define regs_return_value(_regs) ((_regs)->regs[3]) +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[3]; +} #endif /* __KERNEL__ */ diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 92b3c276339a..c0b5c179d27b 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -530,9 +530,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]), - regs->regs[0]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[0]); diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index c8f97649f354..ba720d686435 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -548,9 +548,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs) { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]), - regs->regs[9]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->regs[9]); diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index a0e1bcf843a1..c00c3b5c2806 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -207,7 +207,15 @@ do { current_thread_info()->syscall_noerror = 1; \ #define instruction_pointer(regs) ((regs)->tpc) #define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) -#define regs_return_value(regs) ((regs)->u_regs[UREG_I0]) +static inline int is_syscall_success(struct pt_regs *regs) +{ + return 
!(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY)); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->u_regs[UREG_I0]; +} #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *); #else diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 96ee50a80661..c73c8c50f117 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -1086,17 +1086,8 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { -#ifdef CONFIG_AUDITSYSCALL - if (unlikely(current->audit_context)) { - unsigned long tstate = regs->tstate; - int result = AUDITSC_SUCCESS; + audit_syscall_exit(regs); - if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) - result = AUDITSC_FAILURE; - - audit_syscall_exit(result, regs->u_regs[UREG_I0]); - } -#endif if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->u_regs[UREG_G1]); diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index c9da32b0c707..2ccf25c42feb 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -175,8 +175,8 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) UPT_SYSCALL_ARG2(regs), UPT_SYSCALL_ARG3(regs), UPT_SYSCALL_ARG4(regs)); - else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), - UPT_SYSCALL_RET(regs)); + else + audit_syscall_exit(regs); } /* Fake a debug trap */ diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 3e274564f6bf..64ced0b8f8fd 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -208,12 +209,11 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + call __audit_syscall_exit + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 22d0e21b4dd7..a22facf06f0e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include +#include #include #include #include @@ -466,11 +467,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc87..e51393dd93a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -563,17 +564,16 @@ auditsys: jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? 
*/ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb6..8b0218758775 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0a..af17e1c966dc 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b1621747f..5ef9344a8b24 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -3,3 +3,8 @@ #else #include "ptrace_64.h" #endif + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_SYSCALL_RET(regs); +} diff --git a/include/linux/audit.h b/include/linux/audit.h index 6e1c533f9b46..3d65e4b3ba06 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -26,6 +26,7 @@ #include #include +#include /* The 
netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system @@ -408,10 +409,6 @@ struct audit_field { void *lsm_rule; }; -#define AUDITSC_INVALID 0 -#define AUDITSC_SUCCESS 1 -#define AUDITSC_FAILURE 2 -#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS ) extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); @@ -424,7 +421,7 @@ extern void audit_free(struct task_struct *task); extern void audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); -extern void audit_syscall_exit(int failed, long return_code); +extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct dentry *dentry); @@ -438,6 +435,15 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_syscall_exit(void *pt_regs) +{ + if (unlikely(current->audit_context)) { + int success = is_syscall_success(pt_regs); + int return_code = regs_return_value(pt_regs); + + __audit_syscall_exit(success, return_code); + } +} static inline void audit_getname(const char *name) { if (unlikely(!audit_dummy_context())) @@ -551,12 +557,12 @@ static inline void audit_mmap_fd(int fd, int flags) extern int audit_n_rules; extern int audit_signals; -#else +#else /* CONFIG_AUDITSYSCALL */ #define audit_finish_fork(t) #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) -#define audit_syscall_exit(f,r) do { ; } while (0) +#define audit_syscall_exit(r) do { ; } while (0) #define audit_dummy_context() 1 #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } 
while (0) @@ -587,7 +593,7 @@ extern int audit_signals; #define audit_ptrace(t) ((void)0) #define audit_n_rules 0 #define audit_signals 0 -#endif +#endif /* CONFIG_AUDITSYSCALL */ #ifdef CONFIG_AUDIT /* These are defined in audit.c */ diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 800f113bea66..dd4cefa6519d 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -112,6 +112,7 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ +#include /* for IS_ERR_VALUE */ extern long arch_ptrace(struct task_struct *child, long request, @@ -265,6 +266,15 @@ static inline void ptrace_release_task(struct task_struct *task) #define force_successful_syscall_return() do { } while (0) #endif +#ifndef is_syscall_success +/* + * On most systems we can tell if a syscall is a success based on if the retval + * is an error value. On some systems like ia64 and powerpc they have different + * indicators of success/failure and must define their own. + */ +#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) +#endif + /* * should define the following things inside #ifdef __KERNEL__. * diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e9bcb93800d8..3d2853808185 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,6 +70,11 @@ #include "audit.h" +/* flags stating the success for a syscall */ +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 + /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate * a name dynamically and also add those to the list anchored by names_list. */ @@ -1724,8 +1729,7 @@ void audit_finish_fork(struct task_struct *child) /** * audit_syscall_exit - deallocate audit context after a system call - * @valid: success/failure flag - * @return_code: syscall return value + * @pt_regs: syscall registers * * Tear down after system call. 
If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from @@ -1733,13 +1737,17 @@ void audit_finish_fork(struct task_struct *child) * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(int valid, long return_code) +void __audit_syscall_exit(int success, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - context = audit_get_context(tsk, valid, return_code); + if (success) + success = AUDITSC_SUCCESS; + else + success = AUDITSC_FAILURE; + context = audit_get_context(tsk, success, return_code); if (likely(!context)) return; -- cgit v1.2.3 From f031cd25568a390dc2c9c3a4015054183753449a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: ia32entry.S sign extend error codes when calling 64 bit code In the ia32entry syscall exit audit fastpath we have assembly code which calls __audit_syscall_exit directly. This code was, however, zeroes the upper 32 bits of the return code. It then proceeded to call code which expects longs to be 64bits long. In order to handle code which expects longs to be 64bit we sign extend the return code if that code is an error. Thus the __audit_syscall_exit function can correctly handle using the values in snprintf("%ld"). This fixes the regression introduced in 5cbf1565f29eb57a86a. Old record: type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=4294967283 New record: type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=-13 Signed-off-by: Eric Paris Acked-by: H. 
Peter Anvin --- arch/x86/ia32/ia32entry.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 64ced0b8f8fd..025f0f01d254 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -210,7 +210,9 @@ sysexit_from_sys_call: sti movl %eax,%esi /* second arg, syscall return value */ cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ call __audit_syscall_exit movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ -- cgit v1.2.3 From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: inline audit_syscall_entry to reduce burden on archs Every arch calls: if (unlikely(current->audit_context)) audit_syscall_entry() which requires knowledge about audit (the existence of audit_context) in the arch code. Just do it all in static inline in audit.h so that arches can remain blissfully ignorant.
Signed-off-by: Eric Paris --- arch/ia64/kernel/ptrace.c | 9 +-------- arch/microblaze/kernel/ptrace.c | 6 ++---- arch/mips/kernel/ptrace.c | 7 +++---- arch/powerpc/kernel/ptrace.c | 26 ++++++++++++-------------- arch/s390/kernel/ptrace.c | 11 +++++------ arch/sh/kernel/ptrace_32.c | 7 +++---- arch/sh/kernel/ptrace_64.c | 7 +++---- arch/sparc/kernel/ptrace_64.c | 17 ++++++++--------- arch/um/kernel/ptrace.c | 20 +++++++++----------- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/ptrace.c | 22 ++++++++++------------ arch/xtensa/kernel/ptrace.c | 3 +-- include/linux/audit.h | 13 ++++++++++--- kernel/auditsc.c | 2 +- 16 files changed, 72 insertions(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 2c154088cce7..dad91661ddf9 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, if (test_thread_flag(TIF_RESTORE_RSE)) ia64_sync_krbs(); - if (unlikely(current->audit_context)) { - long syscall; - int arch; - syscall = regs.r15; - arch = AUDIT_ARCH_IA64; - - audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); - } + audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3); return 0; } diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index f564b1bfd386..6eb2aa927d89 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) */ ret = -1L; - if (unlikely(current->audit_context)) - audit_syscall_entry(EM_MICROBLAZE, regs->r12, - regs->r5, regs->r6, - regs->r7, regs->r8); + audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6, + regs->r7, regs->r8); return ret ?: regs->r12; } diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index ab0f1963a7bd..7786b608d932 
100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) } out: - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[2], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[2], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); } /* diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 09d31c12a5e3..5b43325402bc 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gpr[0]); - if (unlikely(current->audit_context)) { #ifdef CONFIG_PPC64 - if (!is_32bit_task()) - audit_syscall_entry(AUDIT_ARCH_PPC64, - regs->gpr[0], - regs->gpr[3], regs->gpr[4], - regs->gpr[5], regs->gpr[6]); - else + if (!is_32bit_task()) + audit_syscall_entry(AUDIT_ARCH_PPC64, + regs->gpr[0], + regs->gpr[3], regs->gpr[4], + regs->gpr[5], regs->gpr[6]); + else #endif - audit_syscall_entry(AUDIT_ARCH_PPC, - regs->gpr[0], - regs->gpr[3] & 0xffffffff, - regs->gpr[4] & 0xffffffff, - regs->gpr[5] & 0xffffffff, - regs->gpr[6] & 0xffffffff); - } + audit_syscall_entry(AUDIT_ARCH_PPC, + regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + regs->gpr[6] & 0xffffffff); return ret ?: regs->gpr[0]; } diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index f52758600980..9d82ed4bcb27 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -740,12 +740,11 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gprs[2]); - if (unlikely(current->audit_context)) - audit_syscall_entry(is_compat_task() ? 
- AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, - regs->gprs[2], regs->orig_gpr2, - regs->gprs[3], regs->gprs[4], - regs->gprs[5]); + audit_syscall_entry(is_compat_task() ? + AUDIT_ARCH_S390 : AUDIT_ARCH_S390X, + regs->gprs[2], regs->orig_gpr2, + regs->gprs[3], regs->gprs[4], + regs->gprs[5]); return ret ?: regs->gprs[2]; } diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index c0b5c179d27b..a3e651563763 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[0]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[3], - regs->regs[4], regs->regs[5], - regs->regs[6], regs->regs[7]); + audit_syscall_entry(audit_arch(), regs->regs[3], + regs->regs[4], regs->regs[5], + regs->regs[6], regs->regs[7]); return ret ?: regs->regs[0]; } diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index ba720d686435..3d0080b5c976 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[9]); - if (unlikely(current->audit_context)) - audit_syscall_entry(audit_arch(), regs->regs[1], - regs->regs[2], regs->regs[3], - regs->regs[4], regs->regs[5]); + audit_syscall_entry(audit_arch(), regs->regs[1], + regs->regs[2], regs->regs[3], + regs->regs[4], regs->regs[5]); return ret ?: regs->regs[9]; } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index c73c8c50f117..9388844cd88c 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -1071,15 +1071,14 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->u_regs[UREG_G1]); 
- if (unlikely(current->audit_context) && !ret) - audit_syscall_entry((test_thread_flag(TIF_32BIT) ? - AUDIT_ARCH_SPARC : - AUDIT_ARCH_SPARC64), - regs->u_regs[UREG_G1], - regs->u_regs[UREG_I0], - regs->u_regs[UREG_I1], - regs->u_regs[UREG_I2], - regs->u_regs[UREG_I3]); + audit_syscall_entry((test_thread_flag(TIF_32BIT) ? + AUDIT_ARCH_SPARC : + AUDIT_ARCH_SPARC64), + regs->u_regs[UREG_G1], + regs->u_regs[UREG_I0], + regs->u_regs[UREG_I1], + regs->u_regs[UREG_I2], + regs->u_regs[UREG_I3]); return ret; } diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 2ccf25c42feb..06b190390505 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit) int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit; int tracesysgood; - if (unlikely(current->audit_context)) { - if (!entryexit) - audit_syscall_entry(HOST_AUDIT_ARCH, - UPT_SYSCALL_NR(regs), - UPT_SYSCALL_ARG1(regs), - UPT_SYSCALL_ARG2(regs), - UPT_SYSCALL_ARG3(regs), - UPT_SYSCALL_ARG4(regs)); - else - audit_syscall_exit(regs); - } + if (!entryexit) + audit_syscall_entry(HOST_AUDIT_ARCH, + UPT_SYSCALL_NR(regs), + UPT_SYSCALL_ARG1(regs), + UPT_SYSCALL_ARG2(regs), + UPT_SYSCALL_ARG3(regs), + UPT_SYSCALL_ARG4(regs)); + else + audit_syscall_exit(regs); /* Fake a debug trap */ if (is_singlestep) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 025f0f01d254..cecfd9a8f734 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -192,7 +192,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a22facf06f0e..1ccd742eba1b 100644 --- 
a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -456,7 +456,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e51393dd93a3..1ca66b650123 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -549,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -559,7 +559,7 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8b0218758775..50267386b766 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); 
#endif - } return ret ?: regs->orig_ax; } diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index a0d042aa2967..2dff698ab02e 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs) do_syscall_trace(); #if 0 - if (unlikely(current->audit_context)) - audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); + audit_syscall_entry(current, AUDIT_ARCH_XTENSA..); #endif } diff --git a/include/linux/audit.h b/include/linux/audit.h index 3d65e4b3ba06..f56ce2669b83 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -418,9 +418,9 @@ extern int audit_classify_arch(int arch); extern void audit_finish_fork(struct task_struct *child); extern int audit_alloc(struct task_struct *task); extern void audit_free(struct task_struct *task); -extern void audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3); +extern void __audit_syscall_entry(int arch, + int major, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); @@ -435,6 +435,13 @@ static inline int audit_dummy_context(void) void *p = current->audit_context; return !p || *(int *)p; } +static inline void audit_syscall_entry(int arch, int major, unsigned long a0, + unsigned long a1, unsigned long a2, + unsigned long a3) +{ + if (unlikely(!audit_dummy_context())) + __audit_syscall_entry(arch, major, a0, a1, a2, a3); +} static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(current->audit_context)) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3d2853808185..b408100dd6ef 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1632,7 +1632,7 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be 
written). */ -void audit_syscall_entry(int arch, int major, +void __audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { -- cgit v1.2.3 From 68f30fbee19cc67849b9fa8e153ede70758afe81 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Jan 2012 15:35:37 -0800 Subject: x86, tsc: Fix SMI induced variation in quick_pit_calibrate() pit_expect_msb() returns success wrongly in the below SMI scenario: a. pit_verify_msb() has not yet seen the MSB transition. b. we are close to the MSB transition though and got a SMI immediately after returning from pit_verify_msb() which didn't see the MSB transition. PIT MSB transition has happened somewhere during SMI execution. c. returned from SMI and we noted down the 'tsc', saw the pit MSB change now and exited the loop to calculate 'deltatsc'. Instead of noting the TSC at the MSB transition, we are way off because of the SMI. And as the SMI happened between the pit_verify_msb() and before the 'tsc' is recorded in the for loop, 'deltatsc' (d1/d2 in quick_pit_calibrate()) will be small and quick_pit_calibrate() will not notice this error. Depending on whether SMI disturbance happens while computing d1 or d2, we will see the TSC calibrated value smaller or bigger than the expected value. As a result, in a cluster we were seeing a variation of approximately +/- 20MHz in the calibrated values, resulting in NTP failures. [ As far as the SMI source is concerned, this is a periodic SMI that gets disabled after ACPI is enabled by the OS. But the TSC calibration happens before the ACPI is enabled. ] To address this, change pit_expect_msb() so that - the 'tsc' is the TSC in between the two reads that read the MSB change from the PIT (same as before) - the 'delta' is the difference in TSC from *before* the MSB changed to *after* the MSB changed.
Now the delta is twice as big as before (it covers four PIT accesses, roughly 4us) and quick_pit_calibrate() will loop a bit longer to get the calibrated value within the 500ppm precision. As the delta (d1/d2) covers four PIT accesses, actual calibrated result might be closer to 250ppm precision. As the loop now takes longer to stabilize, double MAX_QUICK_PIT_MS to 50. SMI disturbance will show up as much larger deltas and the loop will take longer than usual for the result to be within the accepted precision. Or it will fall back to slow PIT calibration if it takes more than 50msec. Also while we are at this, remove the calibration correction that aims to get the result to the middle of the error bars. We really don't know which direction to correct into, so remove it. Reported-and-tested-by: Suresh Siddha Signed-off-by: Linus Torvalds Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1326843337.5291.4.camel@sbsiddha-mobl2 Signed-off-by: H. Peter Anvin --- arch/x86/kernel/tsc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2c9cf0fd78f5..f54694611172 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val) static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; - u64 tsc = 0; + u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; + prev_tsc = tsc; tsc = get_cycles(); } - *deltap = get_cycles() - tsc; + *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* @@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend - * more than 25ms on it. + * more than 50ms on it.
*/ -#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) @@ -383,15 +384,12 @@ success: * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are - * reliable (within the error). We also adjust the - * delta to the middle of the error bars, just - * because it looks nicer. + * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ - delta += (long)(d2 - d1)/2; delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); printk("Fast TSC calibration using PIT\n"); -- cgit v1.2.3 From 6015ff103133c7e50a753c198c69bcabc3a5e3b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Jan 2012 01:51:22 +0000 Subject: x86-32: Fix build failure with AUDIT=y, AUDITSYSCALL=n JONGMAN HEO reports: With current linus git (commit a25a2b84), I got following build error, arch/x86/kernel/vm86_32.c: In function 'do_sys_vm86': arch/x86/kernel/vm86_32.c:340: error: implicit declaration of function '__audit_syscall_exit' make[3]: *** [arch/x86/kernel/vm86_32.o] Error 1 OK, I can reproduce it (32bit allmodconfig with AUDIT=y, AUDITSYSCALL=n) It's due to commit d7e7528bcd45: "Audit: push audit success and retcode into arch ptrace.h". 
Reported-by: JONGMAN HEO Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index af17e1c966dc..b466cab5ba15 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -336,8 +336,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk mark_screen_rdonly(tsk->mm); /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL if (unlikely(current->audit_context)) __audit_syscall_exit(1, 0); +#endif __asm__ __volatile__( "movl %0,%%esp\n\t" -- cgit v1.2.3 From d00a9dd21bdf7908b70866794c8313ee8a5abd5c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Jan 2012 07:21:42 +0000 Subject: net: bpf_jit: fix divide by 0 generation Several problems fixed in this patch : 1) Target of the conditional jump in case a divide by 0 is performed by a bpf is wrong. 2) Must 'generate' the full function prologue/epilogue at pass=0, or else we can stop too early in pass=1 if the proglen doesnt change. (if the increase of prologue/epilogue equals decrease of all instructions length because some jumps are converted to near jumps) 3) Change the wrong length detection at the end of code generation to issue a more explicit message, no need for a full stack trace. Reported-by: Phil Oester Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- arch/x86/net/bpf_jit_comp.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7b65f752c5f8..7c1b765ecc59 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -151,17 +151,18 @@ void bpf_jit_compile(struct sk_filter *fp) cleanup_addr = proglen; /* epilogue address */ for (pass = 0; pass < 10; pass++) { + u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen; /* no prologue/epilogue for trivial filters (RET something) */ proglen = 0; prog = temp; - if (seen) { + if (seen_or_pass0) { EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ /* note : must save %rbx in case bpf_error is hit */ - if (seen & (SEEN_XREG | SEEN_DATAREF)) + if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF)) EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ - if (seen & SEEN_XREG) + if (seen_or_pass0 & SEEN_XREG) CLEAR_X(); /* make sure we dont leek kernel memory */ /* @@ -170,7 +171,7 @@ void bpf_jit_compile(struct sk_filter *fp) * r9 = skb->len - skb->data_len * r8 = skb->data */ - if (seen & SEEN_DATAREF) { + if (seen_or_pass0 & SEEN_DATAREF) { if (offsetof(struct sk_buff, len) <= 127) /* mov off8(%rdi),%r9d */ EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); @@ -260,9 +261,14 @@ void bpf_jit_compile(struct sk_filter *fp) case BPF_S_ALU_DIV_X: /* A /= X; */ seen |= SEEN_XREG; EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ - if (pc_ret0 != -1) - EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4)); - else { + if (pc_ret0 > 0) { + /* addrs[pc_ret0 - 1] is start address of target + * (addrs[i] - 4) is the address following this jmp + * ("xor %edx,%edx; div %ebx" being 4 bytes long) + */ + EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] - + (addrs[i] - 4)); + } else { EMIT_COND_JMP(X86_JNE, 2 + 5); CLEAR_A(); EMIT1_off32(0xe9, cleanup_addr - 
(addrs[i] - 4)); /* jmp .+off32 */ @@ -335,12 +341,12 @@ void bpf_jit_compile(struct sk_filter *fp) } /* fallinto */ case BPF_S_RET_A: - if (seen) { + if (seen_or_pass0) { if (i != flen - 1) { EMIT_JMP(cleanup_addr - addrs[i]); break; } - if (seen & SEEN_XREG) + if (seen_or_pass0 & SEEN_XREG) EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ EMIT1(0xc9); /* leaveq */ } @@ -483,8 +489,9 @@ common_load: seen |= SEEN_DATAREF; goto common_load; case BPF_S_LDX_B_MSH: if ((int)K < 0) { - if (pc_ret0 != -1) { - EMIT_JMP(addrs[pc_ret0] - addrs[i]); + if (pc_ret0 > 0) { + /* addrs[pc_ret0 - 1] is the start address */ + EMIT_JMP(addrs[pc_ret0 - 1] - addrs[i]); break; } CLEAR_A(); @@ -599,13 +606,14 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; * use it to give the cleanup instruction(s) addr */ cleanup_addr = proglen - 1; /* ret */ - if (seen) + if (seen_or_pass0) cleanup_addr -= 1; /* leaveq */ - if (seen & SEEN_XREG) + if (seen_or_pass0 & SEEN_XREG) cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ if (image) { - WARN_ON(proglen != oldproglen); + if (proglen != oldproglen) + pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n", proglen, oldproglen); break; } if (proglen == oldproglen) { -- cgit v1.2.3 From 90a4c0f51e8e44111a926be6f4c87af3938a79c3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Jan 2012 19:26:11 -0800 Subject: uml: fix compile for x86-64 Randy Dunlap reports that we get arch/x86/um/shared/sysdep/ptrace.h:7:20: error: redefinition of 'regs_return_value' arch/x86/um/shared/sysdep/ptrace.h:7:20: note: previous definition of 'regs_return_value' was here when compiling UML for x86-64. Stephen Rothwell root-caused it and says: "Caused by commit d7e7528bcd45 ("Audit: push audit success and retcode into arch ptrace.h") (another patch that was never in linux-next :-(). This file now needs protection against double inclusion." so let's do as the man says. 
Reported-by: Randy Dunlap Analyzed-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- arch/x86/um/shared/sysdep/ptrace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 5ef9344a8b24..2bbe1ec2d96a 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -1,3 +1,6 @@ +#ifndef __SYSDEP_X86_PTRACE_H +#define __SYSDEP_X86_PTRACE_H + #ifdef __i386__ #include "ptrace_32.h" #else @@ -8,3 +11,5 @@ static inline long regs_return_value(struct uml_pt_regs *regs) { return UPT_SYSCALL_RET(regs); } + +#endif /* __SYSDEP_X86_PTRACE_H */ -- cgit v1.2.3 From 4f2f81a5621de47d42476d0b929be2e0d565df84 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Jan 2012 12:41:25 -0800 Subject: x86, syscall: Need __ARCH_WANT_SYS_IPC for 32 bits In checkin 303395ac3bf3 x86: Generate system call tables and unistd_*.h from tables the feature macros in <asm/unistd.h> were unified between 32 and 64 bits. Unfortunately 32 bits requires __ARCH_WANT_SYS_IPC and this was inadvertently dropped. Reported-by: Dmitry Kasatkin Cc: Linus Torvalds Signed-off-by: H.
Peter Anvin Link: http://lkml.kernel.org/r/CALLzPKbeXN5gdngo8uYYU8mAow=XhrwBFBhKfG811f37BubQOg@mail.gmail.com --- arch/x86/include/asm/unistd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index b4a3db7ce140..21f77b89e47a 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -7,6 +7,7 @@ # include # define __ARCH_WANT_IPC_PARSE_VERSION # define __ARCH_WANT_STAT64 +# define __ARCH_WANT_SYS_IPC # define __ARCH_WANT_SYS_OLD_MMAP # define __ARCH_WANT_SYS_OLD_SELECT -- cgit v1.2.3 From 819165fb34b9777f852429f2c6d6f79fbb71b9eb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Jan 2012 16:21:41 +0000 Subject: x86: Adjust asm constraints in atomic64 wrappers Eric pointed out overly restrictive constraints in atomic64_set(), but there are issues throughout the file. In the cited case, %ebx and %ecx are inputs only (don't get changed by either of the two low level implementations). This was also the case elsewhere. Further in many cases early-clobber indicators were missing. Finally, the previous implementation rolled a custom alternative instruction macro from scratch, rather than using alternative_call() (which was introduced with the commit that the description of the change in question actually refers to). Adjusting has the benefit of not hiding referenced symbols from the compiler, which however requires them to be declared not just in the exporting source file (which, as a desirable side effect, in turn allows that exporting file to become a real 5-line stub). This patch does not eliminate the overly restrictive memory clobbers, however: Doing so would occasionally make the compiler set up a second register for accessing the memory object (to satisfy the added "m" constraint), and it's not clear which of the two non-optimal alternatives is better. v2: Re-do the declaration and exporting of the internal symbols. 
Reported-by: Eric Dumazet Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F19A2A5020000780006E0D9@nat28.tlf.novell.com Cc: Luca Barbieri Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 6 ++ arch/x86/include/asm/atomic64_32.h | 147 ++++++++++++++++++++----------------- arch/x86/lib/atomic64_32.c | 59 +-------------- 3 files changed, 88 insertions(+), 124 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 37ad100a2210..49331bedc158 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -145,6 +145,12 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_OUTPUT2(a...) a +/* + * use this macro if you need clobbers but no inputs in + * alternative_{input,io,call}() + */ +#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr + struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT void apply_paravirt(struct paravirt_patch_site *start, diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index fa13f0ec2874..908303f68bba 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -14,13 +14,52 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } +#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...) +#ifndef ATOMIC64_EXPORT +#define ATOMIC64_DECL_ONE __ATOMIC64_DECL +#else +#define ATOMIC64_DECL_ONE(sym) __ATOMIC64_DECL(sym); \ + ATOMIC64_EXPORT(atomic64_##sym) +#endif + #ifdef CONFIG_X86_CMPXCHG64 -#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8" +#define __alternative_atomic64(f, g, out, in...) 
\ + asm volatile("call %P[func]" \ + : out : [func] "i" (atomic64_##g##_cx8), ## in) + +#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) #else -#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8) +#define __alternative_atomic64(f, g, out, in...) \ + alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ + X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in) + +#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \ + ATOMIC64_DECL_ONE(sym##_386) + +ATOMIC64_DECL_ONE(add_386); +ATOMIC64_DECL_ONE(sub_386); +ATOMIC64_DECL_ONE(inc_386); +ATOMIC64_DECL_ONE(dec_386); #endif -#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f) +#define alternative_atomic64(f, out, in...) \ + __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in) + +ATOMIC64_DECL(read); +ATOMIC64_DECL(set); +ATOMIC64_DECL(xchg); +ATOMIC64_DECL(add_return); +ATOMIC64_DECL(sub_return); +ATOMIC64_DECL(inc_return); +ATOMIC64_DECL(dec_return); +ATOMIC64_DECL(dec_if_positive); +ATOMIC64_DECL(inc_not_zero); +ATOMIC64_DECL(add_unless); + +#undef ATOMIC64_DECL +#undef ATOMIC64_DECL_ONE +#undef __ATOMIC64_DECL +#undef ATOMIC64_EXPORT /** * atomic64_cmpxchg - cmpxchg atomic64 variable @@ -50,11 +89,9 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n) long long o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; - asm volatile(ATOMIC64_ALTERNATIVE(xchg) - : "=A" (o), "+b" (low), "+c" (high) - : "S" (v) - : "memory" - ); + alternative_atomic64(xchg, "=&A" (o), + "S" (v), "b" (low), "c" (high) + : "memory"); return o; } @@ -69,11 +106,9 @@ static inline void atomic64_set(atomic64_t *v, long long i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; - asm volatile(ATOMIC64_ALTERNATIVE(set) - : "+b" (low), "+c" (high) - : "S" (v) - : "eax", "edx", "memory" - ); + alternative_atomic64(set, /* no output */, + "S" (v), "b" (low), "c" (high) + : "eax", "edx", "memory"); } /** @@ -85,10 +120,7 @@ 
static inline void atomic64_set(atomic64_t *v, long long i) static inline long long atomic64_read(const atomic64_t *v) { long long r; - asm volatile(ATOMIC64_ALTERNATIVE(read) - : "=A" (r), "+c" (v) - : : "memory" - ); + alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); return r; } @@ -101,10 +133,9 @@ static inline long long atomic64_read(const atomic64_t *v) */ static inline long long atomic64_add_return(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE(add_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + alternative_atomic64(add_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } @@ -113,32 +144,25 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v) */ static inline long long atomic64_sub_return(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE(sub_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + alternative_atomic64(sub_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } static inline long long atomic64_inc_return(atomic64_t *v) { long long a; - asm volatile(ATOMIC64_ALTERNATIVE(inc_return) - : "=A" (a) - : "S" (v) - : "memory", "ecx" - ); + alternative_atomic64(inc_return, "=&A" (a), + "S" (v) : "memory", "ecx"); return a; } static inline long long atomic64_dec_return(atomic64_t *v) { long long a; - asm volatile(ATOMIC64_ALTERNATIVE(dec_return) - : "=A" (a) - : "S" (v) - : "memory", "ecx" - ); + alternative_atomic64(dec_return, "=&A" (a), + "S" (v) : "memory", "ecx"); return a; } @@ -151,10 +175,9 @@ static inline long long atomic64_dec_return(atomic64_t *v) */ static inline long long atomic64_add(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + __alternative_atomic64(add, add_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } @@ -167,10 +190,9 @@ static inline long long atomic64_add(long long i, 
atomic64_t *v) */ static inline long long atomic64_sub(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + __alternative_atomic64(sub, sub_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } @@ -196,10 +218,8 @@ static inline int atomic64_sub_and_test(long long i, atomic64_t *v) */ static inline void atomic64_inc(atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return) - : : "S" (v) - : "memory", "eax", "ecx", "edx" - ); + __alternative_atomic64(inc, inc_return, /* no output */, + "S" (v) : "memory", "eax", "ecx", "edx"); } /** @@ -210,10 +230,8 @@ static inline void atomic64_inc(atomic64_t *v) */ static inline void atomic64_dec(atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return) - : : "S" (v) - : "memory", "eax", "ecx", "edx" - ); + __alternative_atomic64(dec, dec_return, /* no output */, + "S" (v) : "memory", "eax", "ecx", "edx"); } /** @@ -263,15 +281,16 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v) * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as it was not @u. - * Returns the old value of @v. + * Returns non-zero if the add was done, zero otherwise. 
*/ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); - asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t" - : "+A" (a), "+c" (v), "+S" (low), "+D" (high) - : : "memory"); + alternative_atomic64(add_unless, + ASM_OUTPUT2("+A" (a), "+c" (v), + "+S" (low), "+D" (high)), + ASM_NO_INPUT_CLOBBER("memory")); return (int)a; } @@ -279,26 +298,20 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) static inline int atomic64_inc_not_zero(atomic64_t *v) { int r; - asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero) - : "=a" (r) - : "S" (v) - : "ecx", "edx", "memory" - ); + alternative_atomic64(inc_not_zero, "=&a" (r), + "S" (v) : "ecx", "edx", "memory"); return r; } static inline long long atomic64_dec_if_positive(atomic64_t *v) { long long r; - asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive) - : "=A" (r) - : "S" (v) - : "ecx", "memory" - ); + alternative_atomic64(dec_if_positive, "=&A" (r), + "S" (v) : "ecx", "memory"); return r; } -#undef ATOMIC64_ALTERNATIVE -#undef ATOMIC64_ALTERNATIVE_ +#undef alternative_atomic64 +#undef __alternative_atomic64 #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 042f6826bf57..a0b4a350daa7 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -1,59 +1,4 @@ -#include -#include -#include +#define ATOMIC64_EXPORT EXPORT_SYMBOL -#include -#include +#include #include - -long long atomic64_read_cx8(long long, const atomic64_t *v); -EXPORT_SYMBOL(atomic64_read_cx8); -long long atomic64_set_cx8(long long, const atomic64_t *v); -EXPORT_SYMBOL(atomic64_set_cx8); -long long atomic64_xchg_cx8(long long, unsigned high); -EXPORT_SYMBOL(atomic64_xchg_cx8); -long long atomic64_add_return_cx8(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_add_return_cx8); -long long atomic64_sub_return_cx8(long long a, atomic64_t *v); 
-EXPORT_SYMBOL(atomic64_sub_return_cx8); -long long atomic64_inc_return_cx8(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_inc_return_cx8); -long long atomic64_dec_return_cx8(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_dec_return_cx8); -long long atomic64_dec_if_positive_cx8(atomic64_t *v); -EXPORT_SYMBOL(atomic64_dec_if_positive_cx8); -int atomic64_inc_not_zero_cx8(atomic64_t *v); -EXPORT_SYMBOL(atomic64_inc_not_zero_cx8); -int atomic64_add_unless_cx8(atomic64_t *v, long long a, long long u); -EXPORT_SYMBOL(atomic64_add_unless_cx8); - -#ifndef CONFIG_X86_CMPXCHG64 -long long atomic64_read_386(long long, const atomic64_t *v); -EXPORT_SYMBOL(atomic64_read_386); -long long atomic64_set_386(long long, const atomic64_t *v); -EXPORT_SYMBOL(atomic64_set_386); -long long atomic64_xchg_386(long long, unsigned high); -EXPORT_SYMBOL(atomic64_xchg_386); -long long atomic64_add_return_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_add_return_386); -long long atomic64_sub_return_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_sub_return_386); -long long atomic64_inc_return_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_inc_return_386); -long long atomic64_dec_return_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_dec_return_386); -long long atomic64_add_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_add_386); -long long atomic64_sub_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_sub_386); -long long atomic64_inc_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_inc_386); -long long atomic64_dec_386(long long a, atomic64_t *v); -EXPORT_SYMBOL(atomic64_dec_386); -long long atomic64_dec_if_positive_386(atomic64_t *v); -EXPORT_SYMBOL(atomic64_dec_if_positive_386); -int atomic64_inc_not_zero_386(atomic64_t *v); -EXPORT_SYMBOL(atomic64_inc_not_zero_386); -int atomic64_add_unless_386(atomic64_t *v, long long a, long long u); -EXPORT_SYMBOL(atomic64_add_unless_386); -#endif -- cgit v1.2.3 From 
cb8095bba6d24118135a5683a956f4f4fb5f17bb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Jan 2012 16:22:04 +0000 Subject: x86: atomic64 assembly improvements In the "xchg" implementation, %ebx and %ecx don't need to be copied into %eax and %edx respectively (this is only necessary when desiring to only read the stored value). In the "add_unless" implementation, swapping the use of %ecx and %esi for passing arguments allows %esi to become an input only (i.e. permitting the register to be re-used to address the same object without reload). In "{add,sub}_return", doing the initial read64 through the passed in %ecx decreases a register dependency. In "inc_not_zero", a branch can be eliminated by or-ing together the two halves of the current (64-bit) value, and code size can be further reduced by adjusting the arithmetic slightly. v2: Undo the folding of "xchg" and "set". Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F19A2BC020000780006E0DC@nat28.tlf.novell.com Cc: Luca Barbieri Cc: Eric Dumazet Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/atomic64_32.h | 5 ++--- arch/x86/lib/atomic64_386_32.S | 6 +++--- arch/x86/lib/atomic64_cx8_32.S | 29 +++++++++++------------------ 3 files changed, 16 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 908303f68bba..198119910da5 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -288,9 +288,8 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); alternative_atomic64(add_unless, - ASM_OUTPUT2("+A" (a), "+c" (v), - "+S" (low), "+D" (high)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)), + "S" (v) : "memory"); return (int)a; } diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index e8e7e0d06f42..00933d5e992f 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -137,13 +137,13 @@ BEGIN(dec_return) RET_ENDP #undef v -#define v %ecx +#define v %esi BEGIN(add_unless) - addl %eax, %esi + addl %eax, %ecx adcl %edx, %edi addl (v), %eax adcl 4(v), %edx - cmpl %eax, %esi + cmpl %eax, %ecx je 3f 1: movl %eax, (v) diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 391a083674b4..f5cc9eb1d51b 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -55,8 +55,6 @@ ENDPROC(atomic64_set_cx8) ENTRY(atomic64_xchg_cx8) CFI_STARTPROC - movl %ebx, %eax - movl %ecx, %edx 1: LOCK_PREFIX cmpxchg8b (%esi) @@ -78,7 +76,7 @@ ENTRY(atomic64_\func\()_return_cx8) movl %edx, %edi movl %ecx, %ebp - read64 %ebp + read64 %ecx 1: movl %eax, %ebx movl %edx, %ecx @@ -159,23 +157,22 @@ ENTRY(atomic64_add_unless_cx8) SAVE ebx /* these just push these two parameters on the stack */ SAVE edi - SAVE esi + SAVE ecx - movl %ecx, %ebp - movl %eax, %esi + movl %eax, %ebp movl %edx, %edi - read64 
%ebp + read64 %esi 1: cmpl %eax, 0(%esp) je 4f 2: movl %eax, %ebx movl %edx, %ecx - addl %esi, %ebx + addl %ebp, %ebx adcl %edi, %ecx LOCK_PREFIX - cmpxchg8b (%ebp) + cmpxchg8b (%esi) jne 1b movl $1, %eax @@ -199,13 +196,13 @@ ENTRY(atomic64_inc_not_zero_cx8) read64 %esi 1: - testl %eax, %eax - je 4f -2: + movl %eax, %ecx + orl %edx, %ecx + jz 3f movl %eax, %ebx - movl %edx, %ecx + xorl %ecx, %ecx addl $1, %ebx - adcl $0, %ecx + adcl %edx, %ecx LOCK_PREFIX cmpxchg8b (%esi) jne 1b @@ -214,9 +211,5 @@ ENTRY(atomic64_inc_not_zero_cx8) 3: RESTORE ebx ret -4: - testl %edx, %edx - jne 2b - jmp 3b CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) -- cgit v1.2.3 From 7a7546b377bdaa25ac77f33d9433c59f259b9688 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 23 Jan 2012 19:32:25 +0000 Subject: x86: xen: size struct xen_spinlock to always fit in arch_spinlock_t If NR_CPUS < 256 then arch_spinlock_t is only 16 bits wide but struct xen_spinlock is 32 bits. When a spin lock is contended and xl->spinners is modified the two bytes immediately after the spin lock would be corrupted. This is a regression caused by 84eb950db13ca40a0572ce9957e14723500943d6 (x86, ticketlock: Clean up types and accessors) which reduced the size of arch_spinlock_t. Fix this by making xl->spinners a u8 if NR_CPUS < 256. A BUILD_BUG_ON() is also added to check the sizes of the two structures are compatible. In many cases this was not noticable as there would often be padding bytes after the lock (e.g., if any of CONFIG_GENERIC_LOCKBREAK, CONFIG_DEBUG_SPINLOCK, or CONFIG_DEBUG_LOCK_ALLOC were enabled). The bnx2 driver is affected. In struct bnx2, phy_lock and indirect_lock may have no padding after them. Contention on phy_lock would corrupt indirect_lock making it appear locked and the driver would deadlock. 
Signed-off-by: David Vrabel Signed-off-by: Jeremy Fitzhardinge Acked-by: Ian Campbell CC: stable@kernel.org #only 3.2 Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/spinlock.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index cc9b1e182fcf..d69cc6c3f808 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -116,9 +116,26 @@ static inline void spin_time_accum_blocked(u64 start) } #endif /* CONFIG_XEN_DEBUG_FS */ +/* + * Size struct xen_spinlock so it's the same as arch_spinlock_t. + */ +#if NR_CPUS < 256 +typedef u8 xen_spinners_t; +# define inc_spinners(xl) \ + asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory"); +# define dec_spinners(xl) \ + asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory"); +#else +typedef u16 xen_spinners_t; +# define inc_spinners(xl) \ + asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory"); +# define dec_spinners(xl) \ + asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); +#endif + struct xen_spinlock { unsigned char lock; /* 0 -> free; 1 -> locked */ - unsigned short spinners; /* count of waiting cpus */ + xen_spinners_t spinners; /* count of waiting cpus */ }; static int xen_spin_is_locked(struct arch_spinlock *lock) @@ -164,8 +181,7 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) wmb(); /* set lock of interest before count */ - asm(LOCK_PREFIX " incw %0" - : "+m" (xl->spinners) : : "memory"); + inc_spinners(xl); return prev; } @@ -176,8 +192,7 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) */ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) { - asm(LOCK_PREFIX " decw %0" - : "+m" (xl->spinners) : : "memory"); + dec_spinners(xl); wmb(); /* decrement count before restoring lock */ __this_cpu_write(lock_spinners, prev); } @@ -373,6 +388,8 @@ void 
xen_uninit_lock_cpu(int cpu) void __init xen_init_spinlocks(void) { + BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); + pv_lock_ops.spin_is_locked = xen_spin_is_locked; pv_lock_ops.spin_is_contended = xen_spin_is_contended; pv_lock_ops.spin_lock = xen_spin_lock; -- cgit v1.2.3 From 2ed86b16eabe4efbf80cc725a8cbb5310746a2fc Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 25 Jan 2012 20:02:40 -0600 Subject: irq: make SPARSE_IRQ an optionally hidden option On ARM, we don't want SPARSE_IRQ to be a user visible option. Make SPARSE_IRQ visible based on MAY_HAVE_SPARSE_IRQ instead of depending on HAVE_SPARSE_IRQ. With this, SPARSE_IRQ is not visible on C6X and ARM. Signed-off-by: Rob Herring Cc: Russell King Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Mundt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-c6x-dev@linux-c6x.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-sh@vger.kernel.org --- arch/arm/Kconfig | 1 - arch/c6x/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/x86/Kconfig | 1 - kernel/irq/Kconfig | 5 ++--- 6 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 24626b0419ee..30e7840498ce 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -28,7 +28,6 @@ config ARM select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)) select HAVE_C_RECORDMCOUNT select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ select GENERIC_IRQ_SHOW select CPU_PM if (SUSPEND || CPU_IDLE) select GENERIC_PCI_IOMAP diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 26e67f0f0051..2f58c61e2812 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -11,7 +11,7 @@ config TMS320C6X select HAVE_DMA_API_DEBUG select HAVE_GENERIC_HARDIRQS select HAVE_MEMBLOCK - select HAVE_SPARSE_IRQ + select SPARSE_IRQ select OF 
select OF_EARLY_FLATTREE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1919634a9b32..06c1cf0f24a6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -133,7 +133,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ + select MAY_HAVE_SPARSE_IRQ select IRQ_PER_CPU select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 3c8db65c89e5..21b82a8cca21 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -22,7 +22,7 @@ config SUPERH select HAVE_SYSCALL_TRACEPOINTS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ + select MAY_HAVE_SPARSE_IRQ select IRQ_FORCED_THREADING select RTC_LIB select GENERIC_ATOMIC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 864cc6e6ac8e..fb2da445945f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,6 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT select GENERIC_IRQ_PROBE diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5a38bf4de641..1f2dece9ad4c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -13,7 +13,7 @@ config GENERIC_HARDIRQS # Options selectable by the architecture code # Make sparse irq Kconfig switch below available -config HAVE_SPARSE_IRQ +config MAY_HAVE_SPARSE_IRQ bool # Enable the generic irq autoprobe mechanism @@ -61,8 +61,7 @@ config IRQ_FORCED_THREADING bool config SPARSE_IRQ - bool "Support sparse irq numbering" - depends on HAVE_SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ ---help--- Sparse irq numbering is useful for distro kernels that want -- cgit v1.2.3 From 5a51467b146ab7948d2f6812892eac120a30529c Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 18 Jan 2012 20:07:54 -0600 Subject: x86/uv: 
Fix uv_gpa_to_soc_phys_ram() shift uv_gpa_to_soc_phys_ram() was inadvertently ignoring the shift values. This fix takes the shift into account. Signed-off-by: Russ Anderson Cc: Link: http://lkml.kernel.org/r/20120119020753.GA7228@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 54a13aaebc40..21f7385badb8 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -318,13 +318,13 @@ uv_gpa_in_mmr_space(unsigned long gpa) /* UV global physical address --> socket phys RAM */ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) { - unsigned long paddr = gpa & uv_hub_info->gpa_mask; + unsigned long paddr; unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); - gpa = gpa & uv_hub_info->gpa_mask; + paddr = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; -- cgit v1.2.3 From d2ebc71d472020bc30e29afe8c4d2a85a5b41f56 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 18 Jan 2012 09:40:47 -0600 Subject: x86/uv: Fix uninitialized spinlocks Initialize two spinlocks in tlb_uv.c and also properly define/initialize the uv_irq_lock. 
The lack of explicit initialization seems to be functionally harmless, but it is diagnosed when these are turned on: CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_LOCKDEP=y Signed-off-by: Cliff Wickman Cc: Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/E1RnXd1-0003wU-PM@eag09.americas.sgi.com [ Added the uv_irq_lock initialization fix by Dimitri Sivanich ] Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 2 ++ arch/x86/platform/uv/uv_irq.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 9be4cff00a2d..3ae0e61abd23 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1851,6 +1851,8 @@ static void __init init_per_cpu_tunables(void) bcp->cong_reps = congested_reps; bcp->cong_period = congested_period; bcp->clocks_per_100_usec = usec_2_cycles(100); + spin_lock_init(&bcp->queue_lock); + spin_lock_init(&bcp->uvhub_lock); } } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 374a05d8ad22..f25c2765a5c9 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -25,7 +25,7 @@ struct uv_irq_2_mmr_pnode{ int irq; }; -static spinlock_t uv_irq_lock; +static DEFINE_SPINLOCK(uv_irq_lock); static struct rb_root uv_irq_root; static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); -- cgit v1.2.3 From 3fe54564a61f72982032423d24041dca30617ca2 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Wed, 25 Jan 2012 14:35:49 +0800 Subject: x86/numachip: Drop unnecessary conflict with EDAC EDAC detection no longer crashes multi-node systems, so don't conflict on it with NumaChip. 
Signed-off-by: Daniel J Blueman Cc: Steffen Persvold Link: http://lkml.kernel.org/r/1327473349-28395-1-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 864cc6e6ac8e..5bed94e189fa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -360,7 +360,6 @@ config X86_NUMACHIP depends on NUMA depends on SMP depends on X86_X2APIC - depends on !EDAC_AMD64 ---help--- Adds support for Numascale NumaChip large-SMP systems. Needed to enable more than ~168 cores. -- cgit v1.2.3 From 5067cf53cac9b36d42ebb3a45bb12259d0bc1e68 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 23 Jan 2012 23:34:59 +0100 Subject: x86/boot-image: Don't leak phdrs in arch/x86/boot/compressed/misc.c::Parse_elf() We allocate memory with malloc(), but neglect to free it before the variable 'phdrs' goes out of scope --> leak. Signed-off-by: Jesper Juhl Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1201232332590.8772@swampdragon.chaosbits.net [ Mostly harmless. 
] Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 3a19d04cebeb..7116dcba0c9e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -321,6 +321,8 @@ static void parse_elf(void *output) default: /* Ignore other PT_* */ break; } } + + free(phdrs); } asmlinkage void decompress_kernel(void *rmode, memptr heap, -- cgit v1.2.3 From 5d7244e7c984cecead412bde6395ce18618a4a37 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 5 Jan 2012 16:10:42 +0000 Subject: x86-64: Fix memset() to support sizes of 4Gb and above While currently there doesn't appear to be any reachable in-tree case where such large memory blocks may be passed to memset() (alloc_bootmem() being the primary non-reachable one, as it gets called with suitably large sizes in FLATMEM configurations), we have recently hit the problem a second time in our Xen kernels. Rather than working around it a second time, prevent others from falling into the same trap by fixing this long standing limitation. 
Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/4F05D992020000780006AA09@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/lib/memset_64.S | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 79bd454b78a3..2dcb3808cbda 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -19,16 +19,15 @@ .section .altinstr_replacement, "ax", @progbits .Lmemset_c: movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx + movq %rdx,%rcx + andl $7,%edx + shrq $3,%rcx /* expand byte value */ movzbl %sil,%esi movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ + imulq %rsi,%rax rep stosq - movl %r8d,%ecx + movl %edx,%ecx rep stosb movq %r9,%rax ret @@ -50,7 +49,7 @@ .Lmemset_c_e: movq %rdi,%r9 movb %sil,%al - movl %edx,%ecx + movq %rdx,%rcx rep stosb movq %r9,%rax ret @@ -61,12 +60,11 @@ ENTRY(memset) ENTRY(__memset) CFI_STARTPROC movq %rdi,%r10 - movq %rdx,%r11 /* expand byte value */ movzbl %sil,%ecx movabs $0x0101010101010101,%rax - mul %rcx /* with rax, clobbers rdx */ + imulq %rcx,%rax /* align dst */ movl %edi,%r9d @@ -75,13 +73,13 @@ ENTRY(__memset) CFI_REMEMBER_STATE .Lafter_bad_alignment: - movl %r11d,%ecx - shrl $6,%ecx + movq %rdx,%rcx + shrq $6,%rcx jz .Lhandle_tail .p2align 4 .Lloop_64: - decl %ecx + decq %rcx movq %rax,(%rdi) movq %rax,8(%rdi) movq %rax,16(%rdi) @@ -97,7 +95,7 @@ ENTRY(__memset) to predict jump tables. 
*/ .p2align 4 .Lhandle_tail: - movl %r11d,%ecx + movl %edx,%ecx andl $63&(~7),%ecx jz .Lhandle_7 shrl $3,%ecx @@ -109,12 +107,11 @@ ENTRY(__memset) jnz .Lloop_8 .Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx + andl $7,%edx jz .Lende .p2align 4 .Lloop_1: - decl %ecx + decl %edx movb %al,(%rdi) leaq 1(%rdi),%rdi jnz .Lloop_1 @@ -125,13 +122,13 @@ ENTRY(__memset) CFI_RESTORE_STATE .Lbad_alignment: - cmpq $7,%r11 + cmpq $7,%rdx jbe .Lhandle_7 movq %rax,(%rdi) /* unaligned store */ movq $8,%r8 subq %r9,%r8 addq %r8,%rdi - subq %r8,%r11 + subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -- cgit v1.2.3 From 652847aa449cfe364d40018849223f57f31a38e2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jan 2012 17:38:23 +0100 Subject: x86/amd: Add missing feature flag for fam15h models 10h-1fh processors That is the last one missing for those CPUs. Others were recently added with commits fb215366b3c7320ac25dca766a0152df16534932 (KVM: expose latest Intel cpu new features (BMI1/BMI2/FMA/AVX2) to guest) and commit 969df4b82904a30fef19a67398a0c854d223ea67 (x86: Report cpb and eff_freq_ro flags correctly) Signed-off-by: Andreas Herrmann Link: http://lkml.kernel.org/r/20120120163823.GC24508@alberich.amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 17c5d4bdee5e..8d67d428b0f9 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -159,6 +159,7 @@ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ #define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ #define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ #define X86_FEATURE_TOPOEXT (6*32+22) 
/* topology extensions CPUID leafs */ -- cgit v1.2.3 From 5b68edc91cdc972c46f76f85eded7ffddc3ff5c2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jan 2012 17:44:12 +0100 Subject: x86/microcode_amd: Add support for CPU family specific container files We've decided to provide CPU family specific container files (starting with CPU family 15h). E.g. for family 15h we have to load microcode_amd_fam15h.bin instead of microcode_amd.bin Rationale is that starting with family 15h patch size is larger than 2KB which was hard coded as maximum patch size in various microcode loaders (not just Linux). Container files which include patches larger than 2KB cause different kinds of trouble with such old patch loaders. Thus we have to ensure that the default container file provides only patches with size less than 2KB. Signed-off-by: Andreas Herrmann Cc: Borislav Petkov Cc: Link: http://lkml.kernel.org/r/20120120164412.GD24508@alberich.amd.com [ documented the naming convention and tidied the code a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index fe86493f3ed1..ac0417be9131 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -311,13 +311,33 @@ out: return state; } +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Starting at family 15h they are in family specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. 
+ */ static enum ucode_state request_microcode_amd(int cpu, struct device *device) { - const char *fw_name = "amd-ucode/microcode_amd.bin"; + char fw_name[36] = "amd-ucode/microcode_amd.bin"; const struct firmware *fw; enum ucode_state ret = UCODE_NFOUND; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); - if (request_firmware(&fw, fw_name, device)) { + if (request_firmware(&fw, (const char *)fw_name, device)) { pr_err("failed to load file %s\n", fw_name); goto out; } -- cgit v1.2.3 From fc395b9291925b1880e0afc61274fe2f6ddc1269 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:47:37 +0000 Subject: x86: Properly parenthesize cmpxchg() macro arguments Quite oddly, all of the arguments passed through from the top level macros to the second level which didn't need parentheses had them, while the only expression (involving a parameter) needing them didn't. Very recently I got bitten by the lack thereof when using something like "array + index" for the first operand, with "array" being an array more narrow than int. 
Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F2183A9020000780006F3E6@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cmpxchg.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 0c9fa2745f13..b3b733262909 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -145,13 +145,13 @@ extern void __add_wrong_size(void) #ifdef __HAVE_ARCH_CMPXCHG #define cmpxchg(ptr, old, new) \ - __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + __cmpxchg(ptr, old, new, sizeof(*(ptr))) #define sync_cmpxchg(ptr, old, new) \ - __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + __sync_cmpxchg(ptr, old, new, sizeof(*(ptr))) #define cmpxchg_local(ptr, old, new) \ - __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) + __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) #endif /* -- cgit v1.2.3 From 2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:50:55 +0000 Subject: x86-64: Fix memcpy() to support sizes of 4Gb and above While currently there doesn't appear to be any reachable in-tree case where such large memory blocks may be passed to memcpy(), we already had hit the problem in our Xen kernels. Just as was done recently for memset(), rather than working around it, prevent others from falling into the same trap by fixing this long standing limitation.
Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F21846F020000780006F3FA@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/lib/memcpy_64.S | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index efbf2a0ecdea..1235b04a9a60 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -27,9 +27,8 @@ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c: movq %rdi, %rax - - movl %edx, %ecx - shrl $3, %ecx + movq %rdx, %rcx + shrq $3, %rcx andl $7, %edx rep movsq movl %edx, %ecx @@ -48,8 +47,7 @@ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c_e: movq %rdi, %rax - - movl %edx, %ecx + movq %rdx, %rcx rep movsb ret .Lmemcpy_e_e: @@ -60,10 +58,7 @@ ENTRY(memcpy) CFI_STARTPROC movq %rdi, %rax - /* - * Use 32bit CMP here to avoid long NOP padding. - */ - cmp $0x20, %edx + cmpq $0x20, %rdx jb .Lhandle_tail /* @@ -72,7 +67,7 @@ ENTRY(memcpy) */ cmp %dil, %sil jl .Lcopy_backward - subl $0x20, %edx + subq $0x20, %rdx .Lcopy_forward_loop: subq $0x20, %rdx @@ -91,7 +86,7 @@ ENTRY(memcpy) movq %r11, 3*8(%rdi) leaq 4*8(%rdi), %rdi jae .Lcopy_forward_loop - addq $0x20, %rdx + addl $0x20, %edx jmp .Lhandle_tail .Lcopy_backward: @@ -123,11 +118,11 @@ ENTRY(memcpy) /* * Calculate copy position to head. */ - addq $0x20, %rdx + addl $0x20, %edx subq %rdx, %rsi subq %rdx, %rdi .Lhandle_tail: - cmpq $16, %rdx + cmpl $16, %edx jb .Lless_16bytes /* @@ -144,7 +139,7 @@ ENTRY(memcpy) retq .p2align 4 .Lless_16bytes: - cmpq $8, %rdx + cmpl $8, %edx jb .Lless_8bytes /* * Move data from 8 bytes to 15 bytes. 
@@ -156,7 +151,7 @@ ENTRY(memcpy) retq .p2align 4 .Lless_8bytes: - cmpq $4, %rdx + cmpl $4, %edx jb .Lless_3bytes /* -- cgit v1.2.3 From 9d8e22777e66f420e46490e9fc6f8cb7e0e2222b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:55:32 +0000 Subject: x86-64: Handle byte-wise tail copying in memcpy() without a loop While hard to measure, reducing the number of possibly/likely mis-predicted branches can generally be expected to be slightly better. Contrary to how it may appear at first glance, this also doesn't grow the function size (the alignment gap to the next function just gets smaller). Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/lib/memcpy_64.S | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1235b04a9a60..1c273be7c97e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -164,18 +164,19 @@ ENTRY(memcpy) retq .p2align 4 .Lless_3bytes: - cmpl $0, %edx - je .Lend + subl $1, %edx + jb .Lend /* * Move data from 1 bytes to 3 bytes. */ -.Lloop_1: - movb (%rsi), %r8b - movb %r8b, (%rdi) - incq %rdi - incq %rsi - decl %edx - jnz .Lloop_1 + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) .Lend: retq -- cgit v1.2.3 From b3eea29c189a0e3e2ac921e85fabfa4989ee58d7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 26 Jan 2012 17:32:04 +0000 Subject: x86/ioapic: Use legacy_pic to set correct gsi-irq mapping Using compile time NR_IRQS_LEGACY causes the wrong gsi-irq mapping on non-PC platforms, such as Moorestown. This patch uses the legacy_pic abstraction to set the correct number of legacy interrupts at runtime. For Moorestown, nr_legacy_irqs = 0.
We have 1:1 mapping for gsi-irq even within the legacy irq range. Signed-off-by: Jacob Pan Signed-off-by: Dirk Brandewie Link: http://lkml.kernel.org/n/tip-kzvj4xp9tmicuoqoh2w05iay@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fb072754bc1d..9e753663f0d1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1010,7 +1010,7 @@ static int pin_2_irq(int idx, int apic, int pin) } else { u32 gsi = gsi_cfg->gsi_base + pin; - if (gsi >= NR_IRQS_LEGACY) + if (gsi >= legacy_pic->nr_legacy_irqs) irq = gsi; else irq = gsi_top + gsi; @@ -3610,7 +3610,7 @@ static void __init probe_nr_irqs_gsi(void) { int nr; - nr = gsi_top + NR_IRQS_LEGACY; + nr = gsi_top + legacy_pic->nr_legacy_irqs; if (nr > nr_irqs_gsi) nr_irqs_gsi = nr; -- cgit v1.2.3 From d450c088fb00d5a744b1fe8648a488035a10a03c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 26 Jan 2012 17:32:33 +0000 Subject: x86/mrst: Set ISA bus type for fake MP IRQs We use MP IRQs for SFI presented timer interrupts, we should also set mp_bus_not_pci for MP_ISA_BUS so that pin_2_irq mapping is correct. 
Signed-off-by: Jacob Pan Signed-off-by: Dirk Brandewie Link: http://lkml.kernel.org/n/tip-8h3rc1igpp8ir94aas69qmhk@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9e753663f0d1..fb072754bc1d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1010,7 +1010,7 @@ static int pin_2_irq(int idx, int apic, int pin) } else { u32 gsi = gsi_cfg->gsi_base + pin; - if (gsi >= legacy_pic->nr_legacy_irqs) + if (gsi >= NR_IRQS_LEGACY) irq = gsi; else irq = gsi_top + gsi; @@ -3610,7 +3610,7 @@ static void __init probe_nr_irqs_gsi(void) { int nr; - nr = gsi_top + legacy_pic->nr_legacy_irqs; + nr = gsi_top + NR_IRQS_LEGACY; if (nr > nr_irqs_gsi) nr_irqs_gsi = nr; -- cgit v1.2.3 From 1a8359e411eb5055405412a7da812dae63c64a55 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 26 Jan 2012 17:33:30 +0000 Subject: x86/mid: Remove Intel Moorestown All production devices operate in the Oaktrail configuration with legacy PC elements present and an ACPI BIOS. Continue stripping out the Moorestown elements from the tree leaving Medfield. 
Signed-off-by: Alan Cox Cc: jacob.jun.pan@linux.intel.com Link: http://lkml.kernel.org/n/tip-fvm1hgpq99jln6l0fbek68ik@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 21 -- arch/x86/include/asm/mrst.h | 4 +- arch/x86/platform/mrst/Makefile | 1 - arch/x86/platform/mrst/mrst.c | 64 ++-- arch/x86/platform/mrst/pmu.c | 817 ---------------------------------------- arch/x86/platform/mrst/pmu.h | 234 ------------ 6 files changed, 26 insertions(+), 1115 deletions(-) delete mode 100644 arch/x86/platform/mrst/pmu.c delete mode 100644 arch/x86/platform/mrst/pmu.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 864cc6e6ac8e..a13addbcbd5e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -418,27 +418,6 @@ if X86_WANT_INTEL_MID config X86_INTEL_MID bool -config X86_MRST - bool "Moorestown MID platform" - depends on PCI - depends on PCI_GOANY - depends on X86_IO_APIC - select X86_INTEL_MID - select SFI - select DW_APB_TIMER - select APB_TIMER - select I2C - select SPI - select INTEL_SCU_IPC - select X86_PLATFORM_DEVICES - ---help--- - Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin - Internet Device(MID) platform. Moorestown consists of two chips: - Lincroft (CPU core, graphics, and memory controller) and Langwell IOH. - Unlike standard x86 PCs, Moorestown does not have many legacy devices - nor standard legacy replacement devices/features. e.g. Moorestown does - not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. - config X86_MDFLD bool "Medfield MID platform" depends on PCI diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 0a0a95460434..fc18bf3ce7c8 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -26,8 +26,8 @@ extern struct sfi_rtc_table_entry sfi_mrtc_array[]; * identified via MSRs. 
*/ enum mrst_cpu_type { - MRST_CPU_CHIP_LINCROFT = 1, - MRST_CPU_CHIP_PENWELL, + /* 1 was Moorestown */ + MRST_CPU_CHIP_PENWELL = 2, }; extern enum mrst_cpu_type __mrst_cpu_chip; diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index 7baed5135e0f..af1da7e623f9 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,4 +1,3 @@ obj-$(CONFIG_X86_INTEL_MID) += mrst.o obj-$(CONFIG_X86_INTEL_MID) += vrtc.o obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o -obj-$(CONFIG_X86_MRST) += pmu.o diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 475e2cd0f3c3..6743587575a6 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -78,16 +78,11 @@ int sfi_mrtc_num; static void mrst_power_off(void) { - if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) - intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 1); } static void mrst_reboot(void) { - if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) - intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); - else - intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); + intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); } /* parse all the mtimer info to a static mtimer array */ @@ -200,34 +195,28 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) static unsigned long __init mrst_calibrate_tsc(void) { - unsigned long flags, fast_calibrate; - if (__mrst_cpu_chip == MRST_CPU_CHIP_PENWELL) { - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = PENWELL_FSB_FREQ_83SKU; - else - fsb = PENWELL_FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - 
lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - } else { - local_irq_save(flags); - fast_calibrate = apbt_quick_calibrate(); - local_irq_restore(flags); + unsigned long fast_calibrate; + u32 lo, hi, ratio, fsb; + + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); + ratio = (hi >> 8) & 0x1f; + pr_debug("ratio is %d\n", ratio); + if (!ratio) { + pr_err("read a zero ratio, should be incorrect!\n"); + pr_err("force tsc ratio to 16 ...\n"); + ratio = 16; } + rdmsr(MSR_FSB_FREQ, lo, hi); + if ((lo & 0x7) == 0x7) + fsb = PENWELL_FSB_FREQ_83SKU; + else + fsb = PENWELL_FSB_FREQ_100SKU; + fast_calibrate = ratio * fsb; + pr_debug("read penwell tsc %lu khz\n", fast_calibrate); + lapic_timer_frequency = fsb * 1000 / HZ; + /* mark tsc clocksource as reliable */ + set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); if (fast_calibrate) return fast_calibrate; @@ -261,16 +250,11 @@ static void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; - else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; else { - pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n", boot_cpu_data.x86, boot_cpu_data.x86_model); - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; } - pr_debug("Moorestown CPU %s identified\n", - (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? 
- "Lincroft" : "Penwell"); } /* MID systems don't have i8042 controller */ diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c deleted file mode 100644 index c0ac06da57ac..000000000000 --- a/arch/x86/platform/mrst/pmu.c +++ /dev/null @@ -1,817 +0,0 @@ -/* - * mrst/pmu.c - driver for MRST Power Management Unit - * - * Copyright (c) 2011, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "pmu.h" - -#define IPCMSG_FW_REVISION 0xF4 - -struct mrst_device { - u16 pci_dev_num; /* DEBUG only */ - u16 lss; - u16 latest_request; - unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */ -}; - -/* - * comlete list of MRST PCI devices - */ -static struct mrst_device mrst_devs[] = { -/* 0 */ { 0x0800, LSS_SPI0 }, /* Moorestown SPI Ctrl 0 */ -/* 1 */ { 0x0801, LSS_SPI1 }, /* Moorestown SPI Ctrl 1 */ -/* 2 */ { 0x0802, LSS_I2C0 }, /* Moorestown I2C 0 */ -/* 3 */ { 0x0803, LSS_I2C1 }, /* Moorestown I2C 1 */ -/* 4 */ { 0x0804, LSS_I2C2 }, /* Moorestown I2C 2 */ -/* 5 */ { 0x0805, LSS_KBD }, /* Moorestown Keyboard Ctrl */ -/* 6 */ { 0x0806, LSS_USB_HC }, /* Moorestown USB Ctrl */ -/* 7 */ { 0x0807, LSS_SD_HC0 }, /* Moorestown SD Host Ctrl 0 */ -/* 8 */ { 0x0808, LSS_SD_HC1 }, /* Moorestown SD Host Ctrl 1 */ -/* 9 */ { 0x0809, LSS_NAND }, /* Moorestown NAND Ctrl */ -/* 10 */ { 0x080a, LSS_AUDIO }, /* Moorestown Audio Ctrl */ -/* 11 */ { 0x080b, LSS_IMAGING }, /* Moorestown ISP */ -/* 12 */ { 0x080c, LSS_SECURITY }, /* Moorestown Security Controller */ -/* 13 */ { 0x080d, LSS_DISPLAY }, /* Moorestown External Displays */ -/* 14 */ { 0x080e, 0 }, /* Moorestown SCU IPC */ -/* 15 */ { 0x080f, LSS_GPIO }, /* Moorestown GPIO Controller */ -/* 16 */ { 0x0810, 0 }, /* Moorestown Power Management Unit */ -/* 17 */ { 0x0811, LSS_USB_OTG }, /* Moorestown OTG Ctrl */ -/* 18 */ { 0x0812, LSS_SPI2 }, /* Moorestown SPI Ctrl 2 */ -/* 19 */ { 0x0813, 0 }, /* Moorestown SC DMA */ -/* 20 */ { 0x0814, LSS_AUDIO_LPE }, /* Moorestown LPE DMA */ -/* 21 */ { 0x0815, LSS_AUDIO_SSP }, /* Moorestown SSP0 */ - -/* 22 */ { 0x084F, LSS_SD_HC2 }, /* Moorestown SD Host Ctrl 2 */ - -/* 23 */ { 0x4102, 0 }, /* Lincroft */ -/* 24 */ { 0x4110, 0 }, /* Lincroft */ -}; - -/* n.b. 
We ignore PCI-id 0x815 in LSS9 b/c Linux has no driver for it */ -static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0}; -static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803, - 0x0804, 0x0805, 0x080f, 0}; - -/* handle concurrent SMP invokations of pmu_pci_set_power_state() */ -static spinlock_t mrst_pmu_power_state_lock; - -static unsigned int wake_counters[MRST_NUM_LSS]; /* DEBUG only */ -static unsigned int pmu_irq_stats[INT_INVALID + 1]; /* DEBUG only */ - -static int graphics_is_off; -static int lss_s0i3_enabled; -static bool mrst_pmu_s0i3_enable; - -/* debug counters */ -static u32 pmu_wait_ready_calls; -static u32 pmu_wait_ready_udelays; -static u32 pmu_wait_ready_udelays_max; -static u32 pmu_wait_done_calls; -static u32 pmu_wait_done_udelays; -static u32 pmu_wait_done_udelays_max; -static u32 pmu_set_power_state_entry; -static u32 pmu_set_power_state_send_cmd; - -static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num) -{ - int index = 0; - - if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815)) - index = pci_dev_num - 0x800; - else if (pci_dev_num == 0x084F) - index = 22; - else if (pci_dev_num == 0x4102) - index = 23; - else if (pci_dev_num == 0x4110) - index = 24; - - if (pci_dev_num != mrst_devs[index].pci_dev_num) { - WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num); - return 0; - } - - return &mrst_devs[index]; -} - -/** - * mrst_pmu_validate_cstates - * @dev: cpuidle_device - * - * Certain states are not appropriate for governor to pick in some cases. - * This function will be called as cpuidle_device's prepare callback and - * thus tells governor to ignore such states when selecting the next state - * to enter. - */ - -#define IDLE_STATE4_IS_C6 4 -#define IDLE_STATE5_IS_S0I3 5 - -int mrst_pmu_invalid_cstates(void) -{ - int cpu = smp_processor_id(); - - /* - * Demote to C4 if the PMU is busy. - * Since LSS changes leave the busy bit clear... 
- * busy means either the PMU is waiting for an ACK-C6 that - * isn't coming due to an MWAIT that returned immediately; - * or we returned from S0i3 successfully, and the PMU - * is not done sending us interrupts. - */ - if (pmu_read_busy_status()) - return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3; - - /* - * Disallow S0i3 if: PMU is not initialized, or CPU1 is active, - * or if device LSS is insufficient, or the GPU is active, - * or if it has been explicitly disabled. - */ - if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) || - !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable) - return 1 << IDLE_STATE5_IS_S0I3; - else - return 0; -} - -/* - * pmu_update_wake_counters(): read PM_WKS, update wake_counters[] - * DEBUG only. - */ -static void pmu_update_wake_counters(void) -{ - int lss; - u32 wake_status; - - wake_status = pmu_read_wks(); - - for (lss = 0; lss < MRST_NUM_LSS; ++lss) { - if (wake_status & (1 << lss)) - wake_counters[lss]++; - } -} - -int mrst_pmu_s0i3_entry(void) -{ - int status; - - /* Clear any possible error conditions */ - pmu_write_ics(0x300); - - /* set wake control to current D-states */ - pmu_write_wssc(S0I3_SSS_TARGET); - - status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd); - pmu_update_wake_counters(); - return status; -} - -/* poll for maximum of 5ms for busy bit to clear */ -static int pmu_wait_ready(void) -{ - int udelays; - - pmu_wait_ready_calls++; - - for (udelays = 0; udelays < 500; ++udelays) { - if (udelays > pmu_wait_ready_udelays_max) - pmu_wait_ready_udelays_max = udelays; - - if (pmu_read_busy_status() == 0) - return 0; - - udelay(10); - pmu_wait_ready_udelays++; - } - - /* - * if this fires, observe - * /sys/kernel/debug/mrst_pmu_wait_ready_calls - * /sys/kernel/debug/mrst_pmu_wait_ready_udelays - */ - WARN_ONCE(1, "SCU not ready for 5ms"); - return -EBUSY; -} -/* poll for maximum of 50ms us for busy bit to clear */ -static int pmu_wait_done(void) -{ - int udelays; - - 
pmu_wait_done_calls++; - - for (udelays = 0; udelays < 500; ++udelays) { - if (udelays > pmu_wait_done_udelays_max) - pmu_wait_done_udelays_max = udelays; - - if (pmu_read_busy_status() == 0) - return 0; - - udelay(100); - pmu_wait_done_udelays++; - } - - /* - * if this fires, observe - * /sys/kernel/debug/mrst_pmu_wait_done_calls - * /sys/kernel/debug/mrst_pmu_wait_done_udelays - */ - WARN_ONCE(1, "SCU not done for 50ms"); - return -EBUSY; -} - -u32 mrst_pmu_msi_is_disabled(void) -{ - return pmu_msi_is_disabled(); -} - -void mrst_pmu_enable_msi(void) -{ - pmu_msi_enable(); -} - -/** - * pmu_irq - pmu driver interrupt handler - * Context: interrupt context - */ -static irqreturn_t pmu_irq(int irq, void *dummy) -{ - union pmu_pm_ics pmu_ics; - - pmu_ics.value = pmu_read_ics(); - - if (!pmu_ics.bits.pending) - return IRQ_NONE; - - switch (pmu_ics.bits.cause) { - case INT_SPURIOUS: - case INT_CMD_DONE: - case INT_CMD_ERR: - case INT_WAKE_RX: - case INT_SS_ERROR: - case INT_S0IX_MISS: - case INT_NO_ACKC6: - pmu_irq_stats[pmu_ics.bits.cause]++; - break; - default: - pmu_irq_stats[INT_INVALID]++; - } - - pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */ - - return IRQ_HANDLED; -} - -/* - * Translate PCI power management to MRST LSS D-states - */ -static int pci_2_mrst_state(int lss, pci_power_t pci_state) -{ - switch (pci_state) { - case PCI_D0: - if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET) - return D0i1; - else - return D0; - case PCI_D1: - return D0i1; - case PCI_D2: - return D0i2; - case PCI_D3hot: - case PCI_D3cold: - return D0i3; - default: - WARN(1, "pci_state %d\n", pci_state); - return 0; - } -} - -static int pmu_issue_command(u32 pm_ssc) -{ - union pmu_pm_set_cfg_cmd_t command; - - if (pmu_read_busy_status()) { - pr_debug("pmu is busy, Operation not permitted\n"); - return -1; - } - - /* - * enable interrupts in PMU so that interrupts are - * propagated when ioc bit for a particular set - * command is set - */ - - pmu_irq_enable(); - - /* Configure the 
sub systems for pmu2 */ - - pmu_write_ssc(pm_ssc); - - /* - * Send the set config command for pmu its configured - * for mode CM_IMMEDIATE & hence with No Trigger - */ - - command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE; - command.pmu2_params.d_param.cfg_delay = 0; - command.pmu2_params.d_param.rsvd = 0; - - /* construct the command to send SET_CFG to particular PMU */ - command.pmu2_params.d_param.cmd = SET_CFG_CMD; - command.pmu2_params.d_param.ioc = 0; - command.pmu2_params.d_param.mode_id = 0; - command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0; - - /* write the value of PM_CMD into particular PMU */ - pr_debug("pmu command being written %x\n", - command.pmu_pm_set_cfg_cmd_value); - - pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value); - - return 0; -} - -static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state) -{ - u16 existing_request; - int i; - - for (i = 0; ids[i]; ++i) { - struct mrst_device *mrst_dev; - - mrst_dev = pci_id_2_mrst_dev(ids[i]); - if (unlikely(!mrst_dev)) - continue; - - existing_request = mrst_dev->latest_request; - if (existing_request < pci_state) - pci_state = existing_request; - } - return pci_state; -} - -/** - * pmu_pci_set_power_state - Callback function is used by all the PCI devices - * for a platform specific device power on/shutdown. 
- */ - -int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state) -{ - u32 old_sss, new_sss; - int status = 0; - struct mrst_device *mrst_dev; - - pmu_set_power_state_entry++; - - BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL); - BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold); - - mrst_dev = pci_id_2_mrst_dev(pdev->device); - if (unlikely(!mrst_dev)) - return -ENODEV; - - mrst_dev->pci_state_counts[pci_state]++; /* count invocations */ - - /* PMU driver calls self as part of PCI initialization, ignore */ - if (pdev->device == PCI_DEV_ID_MRST_PMU) - return 0; - - BUG_ON(!pmu_reg); /* SW bug if called before initialized */ - - spin_lock(&mrst_pmu_power_state_lock); - - if (pdev->d3_delay) { - dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n", - pdev->d3_delay); - pdev->d3_delay = 0; - } - /* - * If Lincroft graphics, simply remember state - */ - if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY - && !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) { - if (pci_state == PCI_D0) - graphics_is_off = 0; - else - graphics_is_off = 1; - goto ret; - } - - if (!mrst_dev->lss) - goto ret; /* device with no LSS */ - - if (mrst_dev->latest_request == pci_state) - goto ret; /* no change */ - - mrst_dev->latest_request = pci_state; /* record latest request */ - - /* - * LSS9 and LSS10 contain multiple PCI devices. 
- * Use the lowest numbered (highest power) state in the LSS - */ - if (mrst_dev->lss == 9) - pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state); - else if (mrst_dev->lss == 10) - pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state); - - status = pmu_wait_ready(); - if (status) - goto ret; - - old_sss = pmu_read_sss(); - new_sss = old_sss & ~SSMSK(3, mrst_dev->lss); - new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state), - mrst_dev->lss); - - if (new_sss == old_sss) - goto ret; /* nothing to do */ - - pmu_set_power_state_send_cmd++; - - status = pmu_issue_command(new_sss); - - if (unlikely(status != 0)) { - dev_err(&pdev->dev, "Failed to Issue a PM command\n"); - goto ret; - } - - if (pmu_wait_done()) - goto ret; - - lss_s0i3_enabled = - ((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET); -ret: - spin_unlock(&mrst_pmu_power_state_lock); - return status; -} - -#ifdef CONFIG_DEBUG_FS -static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"}; - -static inline const char *d0ix_name(int state) -{ - return d0ix_names[(int) state]; -} - -static int debug_mrst_pmu_show(struct seq_file *s, void *unused) -{ - struct pci_dev *pdev = NULL; - u32 cur_pmsss; - int lss; - - seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET); - - cur_pmsss = pmu_read_sss(); - - seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET); - - seq_printf(s, "0x%08X Current SSS ", cur_pmsss); - seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n"); - - if (cpumask_equal(cpu_online_mask, cpumask_of(0))) - seq_printf(s, "cpu0 is only cpu online\n"); - else - seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n"); - - seq_printf(s, "GFX: %s\n", graphics_is_off ? 
"" : "[BLOCKS s0i3]"); - - - for_each_pci_dev(pdev) { - int pos; - u16 pmcsr; - struct mrst_device *mrst_dev; - int i; - - mrst_dev = pci_id_2_mrst_dev(pdev->device); - - seq_printf(s, "%s %04x/%04X %-16.16s ", - dev_name(&pdev->dev), - pdev->vendor, pdev->device, - dev_driver_string(&pdev->dev)); - - if (unlikely (!mrst_dev)) { - seq_printf(s, " UNKNOWN\n"); - continue; - } - - if (mrst_dev->lss) - seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss, - d0ix_name(((cur_pmsss >> - (mrst_dev->lss * 2)) & 0x3))); - else - seq_printf(s, " "); - - /* PCI PM config space setting */ - pos = pci_find_capability(pdev, PCI_CAP_ID_PM); - if (pos != 0) { - pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr); - seq_printf(s, "PCI-%-4s", - pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK)); - } else { - seq_printf(s, " "); - } - - seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request)); - for (i = 0; i <= PCI_D3cold; ++i) - seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]); - - if (mrst_dev->lss) { - unsigned int lssmask; - - lssmask = SSMSK(D0i3, mrst_dev->lss); - - if ((lssmask & S0I3_SSS_TARGET) && - ((lssmask & cur_pmsss) != - (lssmask & S0I3_SSS_TARGET))) - seq_printf(s , "[BLOCKS s0i3]"); - } - - seq_printf(s, "\n"); - } - seq_printf(s, "Wake Counters:\n"); - for (lss = 0; lss < MRST_NUM_LSS; ++lss) - seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]); - - seq_printf(s, "Interrupt Counters:\n"); - seq_printf(s, - "INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n" - "INT_CMD_ERR \t%8u\n" "INT_WAKE_RX \t%8u\n" - "INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n" - "INT_NO_ACKC6 \t%8u\n" "INT_INVALID \t%8u\n", - pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE], - pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX], - pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS], - pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]); - - seq_printf(s, "mrst_pmu_wait_ready_calls %8d\n", - pmu_wait_ready_calls); - seq_printf(s, "mrst_pmu_wait_ready_udelays %8d\n", - 
pmu_wait_ready_udelays); - seq_printf(s, "mrst_pmu_wait_ready_udelays_max %8d\n", - pmu_wait_ready_udelays_max); - seq_printf(s, "mrst_pmu_wait_done_calls %8d\n", - pmu_wait_done_calls); - seq_printf(s, "mrst_pmu_wait_done_udelays %8d\n", - pmu_wait_done_udelays); - seq_printf(s, "mrst_pmu_wait_done_udelays_max %8d\n", - pmu_wait_done_udelays_max); - seq_printf(s, "mrst_pmu_set_power_state_entry %8d\n", - pmu_set_power_state_entry); - seq_printf(s, "mrst_pmu_set_power_state_send_cmd %8d\n", - pmu_set_power_state_send_cmd); - seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status()); - - return 0; -} - -static int debug_mrst_pmu_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_mrst_pmu_show, NULL); -} - -static const struct file_operations devices_state_operations = { - .open = debug_mrst_pmu_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* DEBUG_FS */ - -/* - * Validate SCU PCI shim PCI vendor capability byte - * against LSS hard-coded in mrst_devs[] above. - * DEBUG only. 
- */ -static void pmu_scu_firmware_debug(void) -{ - struct pci_dev *pdev = NULL; - - for_each_pci_dev(pdev) { - struct mrst_device *mrst_dev; - u8 pci_config_lss; - int pos; - - mrst_dev = pci_id_2_mrst_dev(pdev->device); - if (unlikely(!mrst_dev)) { - printk(KERN_ERR FW_BUG "pmu: Unknown " - "PCI device 0x%04X\n", pdev->device); - continue; - } - - if (mrst_dev->lss == 0) - continue; /* no LSS in our table */ - - pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR); - if (!pos != 0) { - printk(KERN_ERR FW_BUG "pmu: 0x%04X " - "missing PCI Vendor Capability\n", - pdev->device); - continue; - } - pci_read_config_byte(pdev, pos + 4, &pci_config_lss); - if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) { - printk(KERN_ERR FW_BUG "pmu: 0x%04X " - "invalid PCI Vendor Capability 0x%x " - " expected LSS 0x%X\n", - pdev->device, pci_config_lss, mrst_dev->lss); - continue; - } - pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK; - - if (mrst_dev->lss == pci_config_lss) - continue; - - printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n", - pdev->device, pci_config_lss, mrst_dev->lss); - } -} - -/** - * pmu_probe - */ -static int __devinit pmu_probe(struct pci_dev *pdev, - const struct pci_device_id *pci_id) -{ - int ret; - struct mrst_pmu_reg *pmu; - - /* Init the device */ - ret = pci_enable_device(pdev); - if (ret) { - dev_err(&pdev->dev, "Unable to Enable PCI device\n"); - return ret; - } - - ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME); - if (ret < 0) { - dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n"); - goto out_err1; - } - - /* Map the memory of PMU reg base */ - pmu = pci_iomap(pdev, 0, 0); - if (!pmu) { - dev_err(&pdev->dev, "Unable to map the PMU address space\n"); - ret = -ENOMEM; - goto out_err2; - } - -#ifdef CONFIG_DEBUG_FS - /* /sys/kernel/debug/mrst_pmu */ - (void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO, - NULL, NULL, &devices_state_operations); -#endif - pmu_reg = pmu; /* success */ - - if (request_irq(pdev->irq, pmu_irq, 0, 
MRST_PMU_DRV_NAME, NULL)) { - dev_err(&pdev->dev, "Registering isr has failed\n"); - ret = -1; - goto out_err3; - } - - pmu_scu_firmware_debug(); - - pmu_write_wkc(S0I3_WAKE_SOURCES); /* Enable S0i3 wakeup sources */ - - pmu_wait_ready(); - - pmu_write_ssc(D0I1_ACG_SSS_TARGET); /* Enable Auto-Clock_Gating */ - pmu_write_cmd(0x201); - - spin_lock_init(&mrst_pmu_power_state_lock); - - /* Enable the hardware interrupt */ - pmu_irq_enable(); - return 0; - -out_err3: - free_irq(pdev->irq, NULL); - pci_iounmap(pdev, pmu_reg); - pmu_reg = NULL; -out_err2: - pci_release_region(pdev, 0); -out_err1: - pci_disable_device(pdev); - return ret; -} - -static void __devexit pmu_remove(struct pci_dev *pdev) -{ - dev_err(&pdev->dev, "Mid PM pmu_remove called\n"); - - /* Freeing up the irq */ - free_irq(pdev->irq, NULL); - - pci_iounmap(pdev, pmu_reg); - pmu_reg = NULL; - - /* disable the current PCI device */ - pci_release_region(pdev, 0); - pci_disable_device(pdev); -} - -static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = { - { PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 }, - { } -}; - -MODULE_DEVICE_TABLE(pci, pmu_pci_ids); - -static struct pci_driver driver = { - .name = MRST_PMU_DRV_NAME, - .id_table = pmu_pci_ids, - .probe = pmu_probe, - .remove = __devexit_p(pmu_remove), -}; - -/** - * pmu_pci_register - register the PMU driver as PCI device - */ -static int __init pmu_pci_register(void) -{ - return pci_register_driver(&driver); -} - -/* Register and probe via fs_initcall() to preceed device_initcall() */ -fs_initcall(pmu_pci_register); - -static void __exit mid_pci_cleanup(void) -{ - pci_unregister_driver(&driver); -} - -static int ia_major; -static int ia_minor; - -static int pmu_sfi_parse_oem(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - - sb = (struct sfi_table_simple *)table; - ia_major = (sb->pentry[1] >> 0) & 0xFFFF; - ia_minor = (sb->pentry[1] >> 16) & 0xFFFF; - printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n", - ia_major, ia_minor); - - return 0; -} 
- -static int __init scu_fw_check(void) -{ - int ret; - u32 fw_version; - - if (!pmu_reg) - return 0; /* this driver didn't probe-out */ - - sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem); - - if (ia_major < 0x6005 || ia_minor < 0x1525) { - WARN(1, "mrst_pmu: IA FW version too old\n"); - return -1; - } - - ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0, - &fw_version, 1); - - if (ret) { - WARN(1, "mrst_pmu: IPC FW version? %d\n", ret); - } else { - int scu_major = (fw_version >> 8) & 0xFF; - int scu_minor = (fw_version >> 0) & 0xFF; - - printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version); - - if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) { - printk(KERN_INFO "mrst_pmu: enabling S0i3\n"); - mrst_pmu_s0i3_enable = true; - } else { - WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X", - scu_major, scu_minor); - } - } - return 0; -} -late_initcall(scu_fw_check); -module_exit(mid_pci_cleanup); diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h deleted file mode 100644 index bfbfe64b167b..000000000000 --- a/arch/x86/platform/mrst/pmu.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c - * - * Copyright (c) 2011, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
- */ - -#ifndef _MRST_PMU_H_ -#define _MRST_PMU_H_ - -#define PCI_DEV_ID_MRST_PMU 0x0810 -#define MRST_PMU_DRV_NAME "mrst_pmu" -#define PCI_SUB_CLASS_MASK 0xFF00 - -#define PCI_VENDOR_CAP_LOG_ID_MASK 0x7F -#define PCI_VENDOR_CAP_LOG_SS_MASK 0x80 - -#define SUB_SYS_ALL_D0I1 0x01155555 -#define S0I3_WAKE_SOURCES 0x00001FFF - -#define PM_S0I3_COMMAND \ - ((0 << 31) | /* Reserved */ \ - (0 << 30) | /* Core must be idle */ \ - (0xc2 << 22) | /* ACK C6 trigger */ \ - (3 << 19) | /* Trigger on DMI message */ \ - (3 << 16) | /* Enter S0i3 */ \ - (0 << 13) | /* Numeric mode ID (sw) */ \ - (3 << 9) | /* Trigger mode */ \ - (0 << 8) | /* Do not interrupt */ \ - (1 << 0)) /* Set configuration */ - -#define LSS_DMI 0 -#define LSS_SD_HC0 1 -#define LSS_SD_HC1 2 -#define LSS_NAND 3 -#define LSS_IMAGING 4 -#define LSS_SECURITY 5 -#define LSS_DISPLAY 6 -#define LSS_USB_HC 7 -#define LSS_USB_OTG 8 -#define LSS_AUDIO 9 -#define LSS_AUDIO_LPE 9 -#define LSS_AUDIO_SSP 9 -#define LSS_I2C0 10 -#define LSS_I2C1 10 -#define LSS_I2C2 10 -#define LSS_KBD 10 -#define LSS_SPI0 10 -#define LSS_SPI1 10 -#define LSS_SPI2 10 -#define LSS_GPIO 10 -#define LSS_SRAM 11 /* used by SCU, do not touch */ -#define LSS_SD_HC2 12 -/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */ -#define MRST_NUM_LSS 13 - -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - -#define SSMSK(mask, lss) ((mask) << ((lss) * 2)) -#define D0 0 -#define D0i1 1 -#define D0i2 2 -#define D0i3 3 - -#define S0I3_SSS_TARGET ( \ - SSMSK(D0i1, LSS_DMI) | \ - SSMSK(D0i3, LSS_SD_HC0) | \ - SSMSK(D0i3, LSS_SD_HC1) | \ - SSMSK(D0i3, LSS_NAND) | \ - SSMSK(D0i3, LSS_SD_HC2) | \ - SSMSK(D0i3, LSS_IMAGING) | \ - SSMSK(D0i3, LSS_SECURITY) | \ - SSMSK(D0i3, LSS_DISPLAY) | \ - SSMSK(D0i3, LSS_USB_HC) | \ - SSMSK(D0i3, LSS_USB_OTG) | \ - SSMSK(D0i3, LSS_AUDIO) | \ - SSMSK(D0i1, LSS_I2C0)) - -/* - * D0i1 on Langwell is Autonomous Clock Gating (ACG). 
- * Enable ACG on every LSS except camera and audio - */ -#define D0I1_ACG_SSS_TARGET \ - (SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO)) - -enum cm_mode { - CM_NOP, /* ignore the config mode value */ - CM_IMMEDIATE, - CM_DELAY, - CM_TRIGGER, - CM_INVALID -}; - -enum sys_state { - SYS_STATE_S0I0, - SYS_STATE_S0I1, - SYS_STATE_S0I2, - SYS_STATE_S0I3, - SYS_STATE_S3, - SYS_STATE_S5 -}; - -#define SET_CFG_CMD 1 - -enum int_status { - INT_SPURIOUS = 0, - INT_CMD_DONE = 1, - INT_CMD_ERR = 2, - INT_WAKE_RX = 3, - INT_SS_ERROR = 4, - INT_S0IX_MISS = 5, - INT_NO_ACKC6 = 6, - INT_INVALID = 7, -}; - -/* PMU register interface */ -static struct mrst_pmu_reg { - u32 pm_sts; /* 0x00 */ - u32 pm_cmd; /* 0x04 */ - u32 pm_ics; /* 0x08 */ - u32 _resv1; /* 0x0C */ - u32 pm_wkc[2]; /* 0x10 */ - u32 pm_wks[2]; /* 0x18 */ - u32 pm_ssc[4]; /* 0x20 */ - u32 pm_sss[4]; /* 0x30 */ - u32 pm_wssc[4]; /* 0x40 */ - u32 pm_c3c4; /* 0x50 */ - u32 pm_c5c6; /* 0x54 */ - u32 pm_msi_disable; /* 0x58 */ -} *pmu_reg; - -static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); } -static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); } -static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); } -static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); } - -static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); } -static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); } -static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); } -static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); } -static inline void pmu_write_wssc(u32 arg) - { writel(arg, &pmu_reg->pm_wssc[0]); } - -static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); } -static inline u32 pmu_msi_is_disabled(void) - { return readl(&pmu_reg->pm_msi_disable); } - -union pmu_pm_ics { - struct { - u32 cause:8; - u32 enable:1; - u32 pending:1; - u32 
reserved:22; - } bits; - u32 value; -}; - -static inline void pmu_irq_enable(void) -{ - union pmu_pm_ics pmu_ics; - - pmu_ics.value = pmu_read_ics(); - pmu_ics.bits.enable = 1; - pmu_write_ics(pmu_ics.value); -} - -union pmu_pm_status { - struct { - u32 pmu_rev:8; - u32 pmu_busy:1; - u32 mode_id:4; - u32 Reserved:19; - } pmu_status_parts; - u32 pmu_status_value; -}; - -static inline int pmu_read_busy_status(void) -{ - union pmu_pm_status result; - - result.pmu_status_value = pmu_read_sts(); - - return result.pmu_status_parts.pmu_busy; -} - -/* pmu set config parameters */ -struct cfg_delay_param_t { - u32 cmd:8; - u32 ioc:1; - u32 cfg_mode:4; - u32 mode_id:3; - u32 sys_state:3; - u32 cfg_delay:8; - u32 rsvd:5; -}; - -struct cfg_trig_param_t { - u32 cmd:8; - u32 ioc:1; - u32 cfg_mode:4; - u32 mode_id:3; - u32 sys_state:3; - u32 cfg_trig_type:3; - u32 cfg_trig_val:8; - u32 cmbi:1; - u32 rsvd1:1; -}; - -union pmu_pm_set_cfg_cmd_t { - union { - struct cfg_delay_param_t d_param; - struct cfg_trig_param_t t_param; - } pmu2_params; - u32 pmu_pm_set_cfg_cmd_value; -}; - -#ifdef FUTURE_PATCH -extern int mrst_s0i3_entry(u32 regval, u32 *regaddr); -#else -static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; } -#endif -#endif -- cgit v1.2.3 From 15a713df4145ad2540f8d84c3f4de930806f6151 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 26 Jan 2012 17:35:05 +0000 Subject: x86/config: Select MSIC MFD driver on Intel Medfield platform On Intel Medfield platform we use MSIC MFD driver to create necessary platform devices so it is essential to have the driver compiled into the kernel. 
Signed-off-by: Mika Westerberg Signed-off-by: Alan Cox Cc: jacob.jun.pan@linux.intel.com Link: http://lkml.kernel.org/n/tip-7hp1otk4wf4mg5pqohcwt06w@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a13addbcbd5e..c0d49316a63d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -431,6 +431,7 @@ config X86_MDFLD select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES + select MFD_INTEL_MSIC ---help--- Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. -- cgit v1.2.3 From ecfdb0ac15ba983ba4ff11709fdf8f178c0b8b87 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 26 Jan 2012 17:36:08 +0000 Subject: x86/mrst: Add msic_thermal platform support This will let the MSIC driver to create platform device for the thermal driver. Signed-off-by: Mika Westerberg Signed-off-by: Kirill A. Shutemov Signed-off-by: Alan Cox Cc: jacob.jun.pan@linux.intel.com Link: http://lkml.kernel.org/n/tip-rh1jaft9tjpzfql76gd56h1q@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 6743587575a6..721e65285dce 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -670,6 +670,11 @@ static void *msic_ocd_platform_data(void *info) return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); } +static void *msic_thermal_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL); +} + static const struct devs_id __initconst device_ids[] = { {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, @@ -689,6 +694,7 @@ static const struct devs_id __initconst device_ids[] = { {"msic_audio", SFI_DEV_TYPE_IPC, 1, 
&msic_audio_platform_data}, {"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data}, {"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data}, + {"msic_thermal", SFI_DEV_TYPE_IPC, 1, &msic_thermal_platform_data}, {}, }; -- cgit v1.2.3 From b0f4c4b32c8e3aa0d44fc4dd6c40a9a9a8d66b63 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 26 Jan 2012 08:55:34 -0500 Subject: bugs, x86: Fix printk levels for panic, softlockups and stack dumps rsyslog will display KERN_EMERG messages on a connected terminal. However, these messages are useless/undecipherable for a general user. For example, after a softlockup we get: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Stack: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Call Trace: Message from syslogd@intel-s3e37-04 at Jan 25 14:18:06 ... kernel:Code: ff ff a8 08 75 25 31 d2 48 8d 86 38 e0 ff ff 48 89 d1 0f 01 c8 0f ae f0 48 8b 86 38 e0 ff ff a8 08 75 08 b1 01 4c 89 e0 0f 01 c9 ea 69 dd ff 4c 29 e8 48 89 c7 e8 0f bc da ff 49 89 c4 49 89 This happens because the printk levels for these messages are incorrect. Only an informational message should be displayed on a terminal. I modified the printk levels for various messages in the kernel and tested the output by using the drivers/misc/lkdtm.c kernel modules (ie, softlockups, panics, hard lockups, etc.) and confirmed that the console output was still the same and that the output to the terminals was correct. For example, in the case of a softlockup we now see the much more informative: Message from syslogd@intel-s3e37-04 at Jan 25 10:18:06 ... BUG: soft lockup - CPU4 stuck for 60s! instead of the above confusing messages. AFAICT, the messages no longer have to be KERN_EMERG. In the most important case of a panic we set console_verbose(). As for the other less severe cases the correct data is output to the console and /var/log/messages. Successfully tested by me using the drivers/misc/lkdtm.c module. 
Signed-off-by: Prarit Bhargava Cc: dzickus@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1327586134-11926-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 3 ++- arch/x86/kernel/dumpstack_64.c | 6 +++--- arch/x86/mm/fault.c | 4 ++-- kernel/watchdog.c | 2 +- lib/bug.c | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775fc..4025fe4f928f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) unsigned short ss; unsigned long sp; #endif - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); + printk(KERN_DEFAULT + "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284bd..42b2bca0b72c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + printk(KERN_DEFAULT "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_EMERG); + 0, KERN_DEFAULT); - printk(KERN_EMERG "Code: "); + printk(KERN_DEFAULT "Code: "); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d74824a708d..f0b4caf85c1a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -673,7 +673,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, stackend = end_of_stack(tsk); if (tsk != &init_task && *stackend != STACK_END_MAGIC) - printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 
tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -684,7 +684,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, sig = 0; /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); + printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1d7bca7f4f52..d117262deba3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); diff --git a/lib/bug.c b/lib/bug.c index 19552096d16b..a28c1415357c 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -169,7 +169,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_EMERG "------------[ cut here ]------------\n"); + printk(KERN_DEFAULT "------------[ cut here ]------------\n"); if (file) printk(KERN_CRIT "kernel BUG at %s:%u!\n", -- cgit v1.2.3 From 08dda402d60a721ac94e79efd7646b332be3e3b2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 26 Jan 2012 16:02:22 -0800 Subject: x86/mce: Replace hard coded hex constants with symbolic defines Magic constants like 0x0134 in code just invite questions on where they come from, what they mean, can they be changed. Provide #defines for the architecturally defined MCACOD values with a reference to the Intel Software Developers manual which describes them. 
Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index f6c92f99efa0..0c82091b1652 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -56,6 +56,12 @@ static struct severity { #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) #define MCACOD 0xffff +/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ +#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ +#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ +#define MCACOD_DATA 0x0134 /* Data Load */ +#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ MCESEV( NO, "Invalid", @@ -112,12 +118,12 @@ static struct severity { #ifdef CONFIG_MEMORY_FAILURE MCESEV( KEEP, "HT thread notices Action required: data load error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), MCGMASK(MCG_STATUS_EIPV, 0) ), MCESEV( AR, "Action required: data load error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|0x0134), + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER ), #endif @@ -129,11 +135,11 @@ static struct severity { /* known AO MCACODs: */ MCESEV( AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) ), MCESEV( AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) ), MCESEV( SOME, "Action 
optional: unknown MCACOD", -- cgit v1.2.3 From 644e9cbbe3fc032cc92d0936057e166a994dc246 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 26 Jan 2012 00:09:05 +0100 Subject: Add driver auto probing for x86 features v4 There's a growing number of drivers that support a specific x86 feature or CPU. Currently, loading these drivers on a generic distribution requires various driver-specific hacks, and it often doesn't work. This patch adds auto probing for drivers based on the x86 cpuid information, in particular based on vendor/family/model number and also based on CPUID feature bits. For example a common issue is not loading the SSE 4.2 accelerated CRC module: this can significantly lower the performance of BTRFS which relies on fast CRC. Another issue is loading the right CPUFREQ driver for the current CPU. Currently distributions often try all possible drivers until one sticks, which is not really a good way to do this. It works with existing udev without any changes. The code exports the x86 information as a generic string in sysfs that can be matched by udev's pattern matching. This scheme does not support numeric ranges, so if you want to handle e.g. ranges of model numbers they have to be encoded in ASCII or simply all models or families listed. Fixing that would require changing udev. Another issue is that udev will happily load all drivers that match; there is currently no nice way to stop a specific driver from being loaded if it's not needed (e.g. if you don't need fast CRC). But there are not that many cpu specific drivers around and they're all not that bloated, so this isn't a particularly serious issue. Originally this patch added the modalias to the normal cpu sysdevs. However sysdevs don't have all the infrastructure needed for udev, so it couldn't really autoload drivers. This patch instead adds the CPU modaliases to the cpuid devices, which are real devices with full support for udev.
This implies that the cpuid driver has to be loaded to use this. This patch just adds infrastructure, some driver conversions in followups. Thanks to Kay for helping with some sysfs magic. v2: Constifcation, some updates v4: (trenn@suse.de): - Use kzalloc instead of kmalloc to terminate modalias buffer - Use uppercase hex values to match correctly against hex values containing letters Cc: Dave Jones Cc: Kay Sievers Cc: Jen Axboe Cc: Herbert Xu Cc: Huang Ying Cc: Len Brown Signed-off-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpu_device_id.h | 13 ++++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/match.c | 48 +++++++++++++++++++++++++++++ arch/x86/kernel/cpuid.c | 59 +++++++++++++++++++++++++++++++++++- include/linux/mod_devicetable.h | 21 +++++++++++++ scripts/mod/file2alias.c | 24 +++++++++++++++ 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/cpu_device_id.h create mode 100644 arch/x86/kernel/cpu/match.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h new file mode 100644 index 000000000000..ff501e511d91 --- /dev/null +++ b/arch/x86/include/asm/cpu_device_id.h @@ -0,0 +1,13 @@ +#ifndef _CPU_DEVICE_ID +#define _CPU_DEVICE_ID 1 + +/* + * Declare drivers belonging to specific x86 CPUs + * Similar in spirit to pci_device_id and related PCI functions + */ + +#include + +extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); + +#endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 25f24dccdcfa..6ab6aa2fdfdd 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o obj-y += rdrand.o +obj-y += match.o obj-$(CONFIG_X86_32) += 
bugs.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 000000000000..7acc961422e7 --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86_cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. 
+ */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353d93f2..7c89880eefd0 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -138,13 +139,57 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; +static ssize_t print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr) +{ + int size = PAGE_SIZE; + int i, n; + char *buf = bufptr; + + n = snprintf(buf, size, "x86cpu:vendor:%04X:family:" + "%04X:model:%04X:feature:", + boot_cpu_data.x86_vendor, + boot_cpu_data.x86, + boot_cpu_data.x86_model); + size -= n; + buf += n; + size -= 2; + for (i = 0; i < NCAPINTS*32; i++) { + if (boot_cpu_has(i)) { + n = snprintf(buf, size, ",%04X", i); + if (n < 0) { + WARN(1, "x86 features overflow page\n"); + break; + } + size -= n; + buf += n; + } + } + *buf++ = ','; + *buf++ = '\n'; + return buf - bufptr; +} + +static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL); + static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; + int err; dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = device_create_file(dev, &dev_attr_modalias); + if (err) { + /* keep device around on error. attribute is optional. 
*/ + err = 0; + } + + return 0; } static void cpuid_device_destroy(int cpu) @@ -182,6 +227,17 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } +static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (buf) { + print_cpu_modalias(NULL, NULL, buf); + add_uevent_var(env, "MODALIAS=%s", buf); + kfree(buf); + } + return 0; +} + static int __init cpuid_init(void) { int i, err = 0; @@ -200,6 +256,7 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; + cpuid_class->dev_uevent = cpuid_dev_uevent; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index b29e7f6f8fa5..cff2cc08f45a 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -571,4 +571,25 @@ struct amba_id { #endif }; +/* + * Match x86 CPUs for CPU specific drivers. + * See documentation of "x86_match_cpu" for details. + */ + +struct x86_cpu_id { + __u16 vendor; + __u16 family; + __u16 model; + __u16 feature; /* bit index */ + kernel_ulong_t driver_data; +}; + +#define X86_FEATURE_MATCH(x) \ + { X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x } + +#define X86_VENDOR_ANY 0xffff +#define X86_FAMILY_ANY 0 +#define X86_MODEL_ANY 0 +#define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index c0e14b3f2306..026ba38759ca 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1013,6 +1013,30 @@ static int do_amba_entry(const char *filename, } ADD_TO_DEVTABLE("amba", struct amba_id, do_amba_entry); +/* LOOKS like x86cpu:vendor:VVVV:family:FFFF:model:MMMM:feature:*,FEAT,* + * All fields are numbers. 
It would be nicer to use strings for vendor + * and feature, but getting those out of the build system here is too + * complicated. + */ + +static int do_x86cpu_entry(const char *filename, struct x86_cpu_id *id, + char *alias) +{ + id->feature = TO_NATIVE(id->feature); + id->family = TO_NATIVE(id->family); + id->model = TO_NATIVE(id->model); + id->vendor = TO_NATIVE(id->vendor); + + strcpy(alias, "x86cpu:"); + ADD(alias, "vendor:", id->vendor != X86_VENDOR_ANY, id->vendor); + ADD(alias, ":family:", id->family != X86_FAMILY_ANY, id->family); + ADD(alias, ":model:", id->model != X86_MODEL_ANY, id->model); + ADD(alias, ":feature:*,", id->feature != X86_FEATURE_ANY, id->feature); + strcat(alias, ",*"); + return 1; +} +ADD_TO_DEVTABLE("x86cpu", struct x86_cpu_id, do_x86cpu_entry); + /* Does namelen bytes of name exactly match the symbol? */ static bool sym_is(const char *name, unsigned namelen, const char *symbol) { -- cgit v1.2.3 From 3bd391f056df61e928de1680ff4a3e7e07e5b399 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 26 Jan 2012 00:09:06 +0100 Subject: crypto: Add support for x86 cpuid auto loading for x86 crypto drivers Add support for auto-loading of crypto drivers based on cpuid features. This enables auto-loading of the VIA and Intel specific drivers for AES, hashing and CRCs. Requires the earlier infrastructure patch to add x86 modinfo. I kept it all in a single patch for now. I dropped the printks when the driver cpuid doesn't match (imho drivers never should print anything in such a case) One drawback is that udev doesn't know if the drivers are used or not, so they will be unconditionally loaded at boot up. That's better than not loading them at all, like it often happens. Cc: Dave Jones Cc: Kay Sievers Cc: Jen Axboe Cc: Herbert Xu Cc: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. 
Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/aesni-intel_glue.c | 12 +++++++++--- arch/x86/crypto/crc32c-intel.c | 11 ++++++++--- arch/x86/crypto/ghash-clmulni-intel_glue.c | 12 ++++++++---- drivers/crypto/padlock-aes.c | 9 ++++++++- drivers/crypto/padlock-sha.c | 16 ++++++++-------- 5 files changed, 41 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 545d0ce59818..b3350bd32c60 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1253,14 +1254,19 @@ static struct crypto_alg __rfc4106_alg = { }; #endif + +static const struct x86_cpu_id aesni_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_AES), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); + static int __init aesni_init(void) { int err; - if (!cpu_has_aes) { - printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); + if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; - } if ((err = crypto_fpu_init())) goto fpu_err; diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c index b9d00261703c..493f959261f7 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel.c @@ -31,6 +31,7 @@ #include #include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -173,13 +174,17 @@ static struct shash_alg alg = { } }; +static const struct x86_cpu_id crc32c_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_XMM4_2), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id); static int __init crc32c_intel_mod_init(void) { - if (cpu_has_xmm4_2) - return crypto_register_shash(&alg); - else + if (!x86_match_cpu(crc32c_cpu_id)) return -ENODEV; + return crypto_register_shash(&alg); } static void __exit crc32c_intel_mod_fini(void) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 
976aa64d9a20..b4bf0a63b520 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -20,6 +20,7 @@ #include #include #include +#include #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -294,15 +295,18 @@ static struct ahash_alg ghash_async_alg = { }, }; +static const struct x86_cpu_id pcmul_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */ + {} +}; +MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); + static int __init ghash_pclmulqdqni_mod_init(void) { int err; - if (!cpu_has_pclmulqdq) { - printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not" - " detected.\n"); + if (!x86_match_cpu(pcmul_cpu_id)) return -ENODEV; - } err = crypto_register_shash(&ghash_alg); if (err) diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c index 29b9469f8378..37b2e9406af6 100644 --- a/drivers/crypto/padlock-aes.c +++ b/drivers/crypto/padlock-aes.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -503,12 +504,18 @@ static struct crypto_alg cbc_aes_alg = { } }; +static struct x86_cpu_id padlock_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_XCRYPT), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, padlock_cpu_id); + static int __init padlock_init(void) { int ret; struct cpuinfo_x86 *c = &cpu_data(0); - if (!cpu_has_xcrypt) + if (!x86_match_cpu(padlock_cpu_id)) return -ENODEV; if (!cpu_has_xcrypt_enabled) { diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index 06bdb4b2c6a6..9266c0e25492 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c @@ -22,6 +22,7 @@ #include #include #include +#include #include struct padlock_sha_desc { @@ -526,6 +527,12 @@ static struct shash_alg sha256_alg_nano = { } }; +static struct x86_cpu_id padlock_sha_ids[] = { + X86_FEATURE_MATCH(X86_FEATURE_PHE), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, padlock_sha_ids); + static int __init padlock_init(void) { int rc = -ENODEV; @@ -533,15 
+540,8 @@ static int __init padlock_init(void) struct shash_alg *sha1; struct shash_alg *sha256; - if (!cpu_has_phe) { - printk(KERN_NOTICE PFX "VIA PadLock Hash Engine not detected.\n"); - return -ENODEV; - } - - if (!cpu_has_phe_enabled) { - printk(KERN_NOTICE PFX "VIA PadLock detected, but not enabled. Hmm, strange...\n"); + if (!x86_match_cpu(padlock_sha_ids) || !cpu_has_phe_enabled) return -ENODEV; - } /* Register the newly added algorithm module if on * * VIA Nano processor, or else just do as before */ -- cgit v1.2.3 From 2f1e097e24defe64a86535b53768f5c8ab0368d1 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Thu, 26 Jan 2012 00:09:11 +0100 Subject: X86: Introduce HW-Pstate scattered cpuid feature It is rather similar to CPB (boot capability) feature and exists since fam10h (can be looked up in AMD's BKDG). The feature is needed for powernow-k8 to cleanup init functions and to provide proper autoloading matching with the new x86cpu modalias feature. Cc: Kay Sievers Cc: Dave Jones Cc: Borislav Petkov Signed-off-by: Thomas Renninger Acked-by: H. 
Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 17c5d4bdee5e..67b0910ebbb8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,6 +176,7 @@ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c7f64e6f537a..addf9e82a7f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, -- cgit v1.2.3 From 78ff123b05fb15beb1ad670372eea0d299d0b8af Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 26 Jan 2012 00:09:13 +0100 Subject: x86: autoload microcode driver on Intel and AMD systems v2 Don't try to describe the actual models for now. v2: Fix typo: X86_VENDOR_ANY -> X86_FAMILY_ANY (trenn) Signed-off-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. 
Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/microcode_core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fda91c307104..87a0f8688301 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -86,6 +86,7 @@ #include #include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -504,6 +505,20 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); -- cgit v1.2.3 From fad12ac8c8c2591c7f4e61d19b6a9d76cd49fafa Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Thu, 26 Jan 2012 00:09:14 +0100 Subject: CPU: Introduce ARCH_HAS_CPU_AUTOPROBE and X86 parts This patch is based on Andi Kleen's work: Implement autoprobing/loading of modules serving CPU specific features (x86cpu autoloading). And Kay Siever's work to get rid of sysdev cpu structures and making use of struct device instead. Before, the cpuid driver had to be loaded to get the x86cpu autoloading feature. With this patch autoloading works through the /sys/devices/system/cpu object Cc: Kay Sievers Cc: Dave Jones Cc: Jens Axboe Cc: Herbert Xu Cc: Huang Ying Cc: Len Brown Acked-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. 
Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 3 +++ arch/x86/kernel/cpu/match.c | 44 +++++++++++++++++++++++++++++++++ arch/x86/kernel/cpuid.c | 59 +-------------------------------------------- drivers/base/cpu.c | 11 +++++++++ include/linux/cpu.h | 7 ++++++ 5 files changed, 66 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 864cc6e6ac8e..6baa1e66e1bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -179,6 +179,9 @@ config ARCH_HAS_DEFAULT_IDLE config ARCH_HAS_CACHE_LINE_SIZE def_bool y +config ARCH_HAS_CPU_AUTOPROBE + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 7acc961422e7..940e2d483076 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -2,6 +2,7 @@ #include #include #include +#include /** * x86_match_cpu - match current CPU again an array of x86_cpu_ids @@ -46,3 +47,46 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) return NULL; } EXPORT_SYMBOL(x86_match_cpu); + +ssize_t arch_print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr) +{ + int size = PAGE_SIZE; + int i, n; + char *buf = bufptr; + + n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" + "model:%04X:feature:", + boot_cpu_data.x86_vendor, + boot_cpu_data.x86, + boot_cpu_data.x86_model); + size -= n; + buf += n; + size -= 2; + for (i = 0; i < NCAPINTS*32; i++) { + if (boot_cpu_has(i)) { + n = snprintf(buf, size, ",%04X", i); + if (n < 0) { + WARN(1, "x86 features overflow page\n"); + break; + } + size -= n; + buf += n; + } + } + *buf++ = ','; + *buf++ = '\n'; + return buf - bufptr; +} + +int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (buf) { + arch_print_cpu_modalias(NULL, NULL, buf); + add_uevent_var(env, "MODALIAS=%s", buf); + kfree(buf); + } + return 0; +} diff 
--git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7c89880eefd0..a524353d93f2 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include @@ -139,57 +138,13 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static ssize_t print_cpu_modalias(struct device *dev, - struct device_attribute *attr, - char *bufptr) -{ - int size = PAGE_SIZE; - int i, n; - char *buf = bufptr; - - n = snprintf(buf, size, "x86cpu:vendor:%04X:family:" - "%04X:model:%04X:feature:", - boot_cpu_data.x86_vendor, - boot_cpu_data.x86, - boot_cpu_data.x86_model); - size -= n; - buf += n; - size -= 2; - for (i = 0; i < NCAPINTS*32; i++) { - if (boot_cpu_has(i)) { - n = snprintf(buf, size, ",%04X", i); - if (n < 0) { - WARN(1, "x86 features overflow page\n"); - break; - } - size -= n; - buf += n; - } - } - *buf++ = ','; - *buf++ = '\n'; - return buf - bufptr; -} - -static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL); - static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - int err; dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - err = device_create_file(dev, &dev_attr_modalias); - if (err) { - /* keep device around on error. attribute is optional. */ - err = 0; - } - - return 0; + return IS_ERR(dev) ? 
PTR_ERR(dev) : 0; } static void cpuid_device_destroy(int cpu) @@ -227,17 +182,6 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } -static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (buf) { - print_cpu_modalias(NULL, NULL, buf); - add_uevent_var(env, "MODALIAS=%s", buf); - kfree(buf); - } - return 0; -} - static int __init cpuid_init(void) { int i, err = 0; @@ -256,7 +200,6 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; - cpuid_class->dev_uevent = cpuid_dev_uevent; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index db87e78d7459..2a0c670c281d 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "base.h" @@ -223,6 +224,9 @@ int __cpuinit register_cpu(struct cpu *cpu, int num) cpu->node_id = cpu_to_node(num); cpu->dev.id = num; cpu->dev.bus = &cpu_subsys; +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE + cpu->dev.bus->uevent = arch_cpu_uevent; +#endif error = device_register(&cpu->dev); if (!error && cpu->hotpluggable) register_cpu_control(cpu); @@ -247,6 +251,10 @@ struct device *get_cpu_device(unsigned cpu) } EXPORT_SYMBOL_GPL(get_cpu_device); +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE +static DEVICE_ATTR(modalias, 0444, arch_print_cpu_modalias, NULL); +#endif + static struct attribute *cpu_root_attrs[] = { #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE &dev_attr_probe.attr, @@ -257,6 +265,9 @@ static struct attribute *cpu_root_attrs[] = { &cpu_attrs[2].attr.attr, &dev_attr_kernel_max.attr, &dev_attr_offline.attr, +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE + &dev_attr_modalias.attr, +#endif NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 1f6587590a1a..6e53b4823d7f 100644 --- a/include/linux/cpu.h +++ 
b/include/linux/cpu.h @@ -44,6 +44,13 @@ extern ssize_t arch_cpu_release(const char *, size_t); #endif struct notifier_block; +#ifdef CONFIG_ARCH_HAS_CPU_AUTOPROBE +extern int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env); +extern ssize_t arch_print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr); +#endif + /* * CPU notifier priorities. */ -- cgit v1.2.3 From d0caf292505d051b1026e85faf3a85e907566f31 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 28 Jan 2012 13:52:46 +0300 Subject: x86/dumpstack: Remove unneeded check in dump_trace() Smatch complains that we have some inconsistent NULL checking. If "task" were NULL then it would lead to a NULL dereference later. We can remove this test because earlier on in the function we have: if (!task) task = current; Signed-off-by: Dan Carpenter Acked-by: Frederic Weisbecker Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Clemens Ladisch Link: http://lkml.kernel.org/r/20120128105246.GA25092@elgon.mountain Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284bd..af7785ff5aa0 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { if (regs) stack = (unsigned long *)regs->sp; - else if (task && task != current) + else if (task != current) stack = (unsigned long *)task->thread.sp; else stack = &dummy; -- cgit v1.2.3 From cf579dfb82550e34de7ccf3ef090d8b834ccd3a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Sun, 29 Jan 2012 20:38:29 +0100 Subject: PM / Sleep: Introduce "late suspend" and "early resume" of devices The current device suspend/resume phases during system-wide power transitions appear to be insufficient for some platforms that want to use the same callback routines for saving device states and related operations during runtime suspend/resume as well as during system suspend/resume. In principle, they could point their .suspend_noirq() and .resume_noirq() to the same callback routines as their .runtime_suspend() and .runtime_resume(), respectively, but at least some of them require device interrupts to be enabled while the code in those routines is running. It also makes sense to have device suspend-resume callbacks that will be executed with runtime PM disabled and with device interrupts enabled in case someone needs to run some special code in that context during system-wide power transitions. Apart from this, .suspend_noirq() and .resume_noirq() were introduced as a workaround for drivers using shared interrupts and failing to prevent their interrupt handlers from accessing suspended hardware. It appears to be better not to use them for other purposes, or we may have to deal with some serious confusion (which seems to be happening already). For the above reasons, introduce new device suspend/resume phases, "late suspend" and "early resume" (and analogously for hibernation) whose callbacks will be executed with runtime PM disabled and with device interrupts enabled and whose callback pointers generally may point to runtime suspend/resume routines. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Mark Brown Reviewed-by: Kevin Hilman --- Documentation/power/devices.txt | 93 +++++++++------ arch/x86/kernel/apm_32.c | 11 +- drivers/base/power/main.c | 247 ++++++++++++++++++++++++++++++++++++---- drivers/xen/manage.c | 6 +- include/linux/pm.h | 43 +++++-- include/linux/suspend.h | 4 + kernel/kexec.c | 8 +- kernel/power/hibernate.c | 24 ++-- kernel/power/main.c | 8 +- kernel/power/suspend.c | 4 +- 10 files changed, 357 insertions(+), 91 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 20af7def23c8..872815cd41d3 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -96,6 +96,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -305,7 +311,7 @@ Entering System Suspend ----------------------- When the system goes into the standby or memory sleep state, the phases are: - prepare, suspend, suspend_noirq. + prepare, suspend, suspend_late, suspend_noirq. 1. The prepare phase is meant to prevent races by preventing new devices from being registered; the PM core would never know that all the @@ -324,7 +330,12 @@ When the system goes into the standby or memory sleep state, the phases are: appropriate low-power state, depending on the bus type the device is on, and they may enable wakeup events. - 3. 
The suspend_noirq phase occurs after IRQ handlers have been disabled, + 3. For a number of devices it is convenient to split suspend into the + "quiesce device" and "save device state" phases, in which case + suspend_late is meant to do the latter. It is always executed after + runtime power management has been disabled for all devices. + + 4. The suspend_noirq phase occurs after IRQ handlers have been disabled, which means that the driver's interrupt handler will not be called while the callback method is running. The methods should save the values of the device's registers that weren't saved previously and finally put the @@ -359,7 +370,7 @@ Leaving System Suspend ---------------------- When resuming from standby or memory sleep, the phases are: - resume_noirq, resume, complete. + resume_noirq, resume_early, resume, complete. 1. The resume_noirq callback methods should perform any actions needed before the driver's interrupt handlers are invoked. This generally @@ -375,14 +386,18 @@ When resuming from standby or memory sleep, the phases are: device driver's ->pm.resume_noirq() method to perform device-specific actions. - 2. The resume methods should bring the the device back to its operating + 2. The resume_early methods should prepare devices for the execution of + the resume methods. This generally involves undoing the actions of the + preceding suspend_late phase. + + 3. The resume methods should bring the device back to its operating state, so that it can perform normal I/O. This generally involves undoing the actions of the suspend phase. - 3. The complete phase uses only a bus callback. The method should undo the - actions of the prepare phase. Note, however, that new children may be - registered below the device as soon as the resume callbacks occur; it's - not necessary to wait until the complete phase. + 4. The complete phase should undo the actions of the prepare phase. 
Note, + however, that new children may be registered below the device as soon as + the resume callbacks occur; it's not necessary to wait until the + complete phase. At the end of these phases, drivers should be as functional as they were before suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are @@ -429,8 +444,8 @@ an image of the system memory while everything is stable, reactivate all devices (thaw), write the image to permanent storage, and finally shut down the system (poweroff). The phases used to accomplish this are: - prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete, - prepare, poweroff, poweroff_noirq + prepare, freeze, freeze_late, freeze_noirq, thaw_noirq, thaw_early, + thaw, complete, prepare, poweroff, poweroff_late, poweroff_noirq 1. The prepare phase is discussed in the "Entering System Suspend" section above. @@ -441,7 +456,11 @@ system (poweroff). The phases used to accomplish this are: save time it's best not to do so. Also, the device should not be prepared to generate wakeup events. - 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed + 3. The freeze_late phase is analogous to the suspend_late phase described + above, except that the device should not be put in a low-power state and + should not be allowed to generate wakeup events by it. + + 4. The freeze_noirq phase is analogous to the suspend_noirq phase discussed above, except again that the device should not be put in a low-power state and should not be allowed to generate wakeup events. @@ -449,15 +468,19 @@ At this point the system image is created. All devices should be inactive and the contents of memory should remain undisturbed while this happens, so that the image forms an atomic snapshot of the system state. - 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed + 5. The thaw_noirq phase is analogous to the resume_noirq phase discussed above. 
The main difference is that its methods can assume the device is in the same state as at the end of the freeze_noirq phase. - 5. The thaw phase is analogous to the resume phase discussed above. Its + 6. The thaw_early phase is analogous to the resume_early phase described + above. Its methods should undo the actions of the preceding + freeze_late, if necessary. + + 7. The thaw phase is analogous to the resume phase discussed above. Its methods should bring the device back to an operating state, so that it can be used for saving the image if necessary. - 6. The complete phase is discussed in the "Leaving System Suspend" section + 8. The complete phase is discussed in the "Leaving System Suspend" section above. At this point the system image is saved, and the devices then need to be @@ -465,16 +488,19 @@ prepared for the upcoming system shutdown. This is much like suspending them before putting the system into the standby or memory sleep state, and the phases are similar. - 7. The prepare phase is discussed above. + 9. The prepare phase is discussed above. + + 10. The poweroff phase is analogous to the suspend phase. - 8. The poweroff phase is analogous to the suspend phase. + 11. The poweroff_late phase is analogous to the suspend_late phase. - 9. The poweroff_noirq phase is analogous to the suspend_noirq phase. + 12. The poweroff_noirq phase is analogous to the suspend_noirq phase. -The poweroff and poweroff_noirq callbacks should do essentially the same things -as the suspend and suspend_noirq callbacks. The only notable difference is that -they need not store the device register values, because the registers should -already have been stored during the freeze or freeze_noirq phases. +The poweroff, poweroff_late and poweroff_noirq callbacks should do essentially +the same things as the suspend, suspend_late and suspend_noirq callbacks, +respectively. 
The only notable difference is that they need not store the +device register values, because the registers should already have been stored +during the freeze, freeze_late or freeze_noirq phases. Leaving Hibernation @@ -518,22 +544,25 @@ To achieve this, the image kernel must restore the devices' pre-hibernation functionality. The operation is much like waking up from the memory sleep state, although it involves different phases: - restore_noirq, restore, complete + restore_noirq, restore_early, restore, complete 1. The restore_noirq phase is analogous to the resume_noirq phase. - 2. The restore phase is analogous to the resume phase. + 2. The restore_early phase is analogous to the resume_early phase. + + 3. The restore phase is analogous to the resume phase. - 3. The complete phase is discussed above. + 4. The complete phase is discussed above. -The main difference from resume[_noirq] is that restore[_noirq] must assume the -device has been accessed and reconfigured by the boot loader or the boot kernel. -Consequently the state of the device may be different from the state remembered -from the freeze and freeze_noirq phases. The device may even need to be reset -and completely re-initialized. In many cases this difference doesn't matter, so -the resume[_noirq] and restore[_norq] method pointers can be set to the same -routines. Nevertheless, different callback pointers are used in case there is a -situation where it actually matters. +The main difference from resume[_early|_noirq] is that restore[_early|_noirq] +must assume the device has been accessed and reconfigured by the boot loader or +the boot kernel. Consequently the state of the device may be different from the +state remembered from the freeze, freeze_late and freeze_noirq phases. The +device may even need to be reset and completely re-initialized. In many cases +this difference doesn't matter, so the resume[_early|_noirq] and +restore[_early|_noirq] method pointers can be set to the same routines. 
+Nevertheless, different callback pointers are used in case there is a situation +where it actually does matter. Device Power Management Domains diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f76623cbe263..5d56931a15b3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1234,8 +1234,7 @@ static int suspend(int vetoable) struct apm_user *as; dpm_suspend_start(PMSG_SUSPEND); - - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1259,9 +1258,9 @@ static int suspend(int vetoable) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); - + dpm_resume_start(PMSG_RESUME); dpm_resume_end(PMSG_RESUME); + queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1276,7 @@ static void standby(void) { int err; - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1291,7 +1290,7 @@ static void standby(void) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); } static apm_event_t get_event(void) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e2cc3d2e0ecc..b462c0e341cb 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -47,6 +47,7 @@ typedef int (*pm_callback_t)(struct device *); LIST_HEAD(dpm_list); LIST_HEAD(dpm_prepared_list); LIST_HEAD(dpm_suspended_list); +LIST_HEAD(dpm_late_early_list); LIST_HEAD(dpm_noirq_list); struct suspend_stats suspend_stats; @@ -245,6 +246,40 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) return NULL; } +/** + * pm_late_early_op - Return the PM operation appropriate for given PM event. + * @ops: PM operations to choose from. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. 
+ */ +static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, + pm_message_t state) +{ + switch (state.event) { +#ifdef CONFIG_SUSPEND + case PM_EVENT_SUSPEND: + return ops->suspend_late; + case PM_EVENT_RESUME: + return ops->resume_early; +#endif /* CONFIG_SUSPEND */ +#ifdef CONFIG_HIBERNATE_CALLBACKS + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_late; + case PM_EVENT_HIBERNATE: + return ops->poweroff_late; + case PM_EVENT_THAW: + case PM_EVENT_RECOVER: + return ops->thaw_early; + case PM_EVENT_RESTORE: + return ops->restore_early; +#endif /* CONFIG_HIBERNATE_CALLBACKS */ + } + + return NULL; +} + /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. @@ -374,21 +409,21 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) TRACE_RESUME(0); if (dev->pm_domain) { - info = "EARLY power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "EARLY type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "EARLY class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "EARLY bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "EARLY driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -399,13 +434,13 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) } /** - * dpm_resume_noirq - Execute "early resume" callbacks for non-sysdev devices. + * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. 
* - * Call the "noirq" resume handlers for all devices marked as DPM_OFF_IRQ and + * Call the "noirq" resume handlers for all devices in dpm_noirq_list and * enable device drivers to receive interrupts. */ -void dpm_resume_noirq(pm_message_t state) +static void dpm_resume_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); @@ -415,7 +450,7 @@ void dpm_resume_noirq(pm_message_t state) int error; get_device(dev); - list_move_tail(&dev->power.entry, &dpm_suspended_list); + list_move_tail(&dev->power.entry, &dpm_late_early_list); mutex_unlock(&dpm_list_mtx); error = device_resume_noirq(dev, state); @@ -423,6 +458,80 @@ void dpm_resume_noirq(pm_message_t state) suspend_stats.failed_resume_noirq++; dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, " noirq", error); + } + + mutex_lock(&dpm_list_mtx); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + dpm_show_time(starttime, state, "noirq"); + resume_device_irqs(); +} + +/** + * device_resume_early - Execute an "early resume" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. 
+ */ +static int device_resume_early(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + int error = 0; + + TRACE_DEVICE(dev); + TRACE_RESUME(0); + + if (dev->pm_domain) { + info = "early power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "early type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "early class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "early bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "early driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + error = dpm_run_callback(callback, dev, state, info); + + TRACE_RESUME(error); + return error; +} + +/** + * dpm_resume_early - Execute "early resume" callbacks for all devices. + * @state: PM transition of the system being carried out. + */ +static void dpm_resume_early(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.next); + int error; + + get_device(dev); + list_move_tail(&dev->power.entry, &dpm_suspended_list); + mutex_unlock(&dpm_list_mtx); + + error = device_resume_early(dev, state); + if (error) { + suspend_stats.failed_resume_early++; + dpm_save_failed_step(SUSPEND_RESUME_EARLY); + dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, " early", error); } @@ -431,9 +540,18 @@ void dpm_resume_noirq(pm_message_t state) } mutex_unlock(&dpm_list_mtx); dpm_show_time(starttime, state, "early"); - resume_device_irqs(); } -EXPORT_SYMBOL_GPL(dpm_resume_noirq); + +/** + * dpm_resume_start - Execute "noirq" and "early" device callbacks. + * @state: PM transition of the system being carried out. 
+ */ +void dpm_resume_start(pm_message_t state) +{ + dpm_resume_noirq(state); + dpm_resume_early(state); +} +EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. @@ -716,21 +834,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) char *info = NULL; if (dev->pm_domain) { - info = "LATE power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "LATE type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "LATE class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "LATE bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "LATE driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -738,21 +856,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) } /** - * dpm_suspend_noirq - Execute "late suspend" callbacks for non-sysdev devices. + * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers from receiving interrupts and call the "noirq" suspend * handlers for all non-sysdev devices. 
*/ -int dpm_suspend_noirq(pm_message_t state) +static int dpm_suspend_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; suspend_device_irqs(); mutex_lock(&dpm_list_mtx); - while (!list_empty(&dpm_suspended_list)) { - struct device *dev = to_device(dpm_suspended_list.prev); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.prev); get_device(dev); mutex_unlock(&dpm_list_mtx); @@ -761,7 +879,7 @@ int dpm_suspend_noirq(pm_message_t state) mutex_lock(&dpm_list_mtx); if (error) { - pm_dev_err(dev, state, " late", error); + pm_dev_err(dev, state, " noirq", error); suspend_stats.failed_suspend_noirq++; dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_save_failed_dev(dev_name(dev)); @@ -775,11 +893,96 @@ int dpm_suspend_noirq(pm_message_t state) mutex_unlock(&dpm_list_mtx); if (error) dpm_resume_noirq(resume_event(state)); + else + dpm_show_time(starttime, state, "noirq"); + return error; +} + +/** + * device_suspend_late - Execute a "late suspend" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. 
+ */ +static int device_suspend_late(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + + if (dev->pm_domain) { + info = "late power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "late type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "late class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "late bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "late driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + return dpm_run_callback(callback, dev, state, info); +} + +/** + * dpm_suspend_late - Execute "late suspend" callbacks for all devices. + * @state: PM transition of the system being carried out. + */ +static int dpm_suspend_late(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + int error = 0; + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_suspended_list)) { + struct device *dev = to_device(dpm_suspended_list.prev); + + get_device(dev); + mutex_unlock(&dpm_list_mtx); + + error = device_suspend_late(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) { + pm_dev_err(dev, state, " late", error); + suspend_stats.failed_suspend_late++; + dpm_save_failed_step(SUSPEND_SUSPEND_LATE); + dpm_save_failed_dev(dev_name(dev)); + put_device(dev); + break; + } + if (!list_empty(&dev->power.entry)) + list_move(&dev->power.entry, &dpm_late_early_list); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + if (error) + dpm_resume_early(resume_event(state)); else dpm_show_time(starttime, state, "late"); + return error; } -EXPORT_SYMBOL_GPL(dpm_suspend_noirq); + +/** + * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. + * @state: PM transition of the system being carried out. 
+ */ +int dpm_suspend_end(pm_message_t state) +{ + int error = dpm_suspend_late(state); + + return error ? : dpm_suspend_noirq(state); +} +EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index ce4fa0831860..9e14ae6cd49c 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -129,9 +129,9 @@ static void do_suspend(void) printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = dpm_suspend_noirq(PMSG_FREEZE); + err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); + printk(KERN_ERR "dpm_suspend_end failed: %d\n", err); goto out_resume; } @@ -149,7 +149,7 @@ static void do_suspend(void) err = stop_machine(xen_suspend, &si, cpumask_of(0)); - dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); diff --git a/include/linux/pm.h b/include/linux/pm.h index e4982ac3fbbc..c68e1f22ac95 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -110,6 +110,10 @@ typedef struct pm_message { * Subsystem-level @suspend() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @suspend_late: Continue operations started by @suspend(). For a number of + * devices @suspend_late() may point to the same callback routine as the + * runtime suspend callback. + * * @resume: Executed after waking the system up from a sleep state in which the * contents of main memory were preserved. The exact action to perform * depends on the device's subsystem, but generally the driver is expected @@ -122,6 +126,10 @@ typedef struct pm_message { * Subsystem-level @resume() is executed for all devices after invoking * subsystem-level @resume_noirq() for all of them. * + * @resume_early: Prepare to execute @resume(). 
For a number of devices + * @resume_early() may point to the same callback routine as the runtime + * resume callback. + * * @freeze: Hibernation-specific, executed before creating a hibernation image. * Analogous to @suspend(), but it should not enable the device to signal * wakeup events or change its power state. The majority of subsystems @@ -131,6 +139,10 @@ typedef struct pm_message { * Subsystem-level @freeze() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @freeze_late: Continue operations started by @freeze(). Analogous to + * @suspend_late(), but it should not enable the device to signal wakeup + * events or change its power state. + * * @thaw: Hibernation-specific, executed after creating a hibernation image OR * if the creation of an image has failed. Also executed after a failing * attempt to restore the contents of main memory from such an image. @@ -140,15 +152,23 @@ typedef struct pm_message { * subsystem-level @thaw_noirq() for all of them. It also may be executed * directly after @freeze() in case of a transition error. * + * @thaw_early: Prepare to execute @thaw(). Undo the changes made by the + * preceding @freeze_late(). + * * @poweroff: Hibernation-specific, executed after saving a hibernation image. * Analogous to @suspend(), but it need not save the device's settings in * memory. * Subsystem-level @poweroff() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @poweroff_late: Continue operations started by @poweroff(). Analogous to + * @suspend_late(), but it need not save the device's settings in memory. + * * @restore: Hibernation-specific, executed after restoring the contents of main * memory from a hibernation image, analogous to @resume(). * + * @restore_early: Prepare to execute @restore(), analogous to @resume_early(). + * * @suspend_noirq: Complete the actions started by @suspend(). 
Carry out any * additional operations required for suspending the device that might be * racing with its driver's interrupt handler, which is guaranteed not to @@ -158,9 +178,10 @@ typedef struct pm_message { * @suspend_noirq() has returned successfully. If the device can generate * system wakeup signals and is enabled to wake up the system, it should be * configured to do so at that time. However, depending on the platform - * and device's subsystem, @suspend() may be allowed to put the device into - * the low-power state and configure it to generate wakeup signals, in - * which case it generally is not necessary to define @suspend_noirq(). + * and device's subsystem, @suspend() or @suspend_late() may be allowed to + * put the device into the low-power state and configure it to generate + * wakeup signals, in which case it generally is not necessary to define + * @suspend_noirq(). * * @resume_noirq: Prepare for the execution of @resume() by carrying out any * operations required for resuming the device that might be racing with @@ -171,9 +192,9 @@ typedef struct pm_message { * additional operations required for freezing the device that might be * racing with its driver's interrupt handler, which is guaranteed not to * run while @freeze_noirq() is being executed. - * The power state of the device should not be changed by either @freeze() - * or @freeze_noirq() and it should not be configured to signal system - * wakeup by any of these callbacks. + * The power state of the device should not be changed by either @freeze(), + * or @freeze_late(), or @freeze_noirq() and it should not be configured to + * signal system wakeup by any of these callbacks. 
* * @thaw_noirq: Prepare for the execution of @thaw() by carrying out any * operations required for thawing the device that might be racing with its @@ -249,6 +270,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -584,13 +611,13 @@ struct dev_pm_domain { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); -extern void dpm_resume_noirq(pm_message_t state); +extern void dpm_resume_start(pm_message_t state); extern void dpm_resume_end(pm_message_t state); extern void dpm_resume(pm_message_t state); extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); -extern int dpm_suspend_noirq(pm_message_t state); +extern int dpm_suspend_end(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); extern int dpm_suspend(pm_message_t state); extern int dpm_prepare(pm_message_t state); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 91784a4f8608..ac1c114c499d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -42,8 +42,10 @@ enum suspend_stat_step { SUSPEND_FREEZE = 1, SUSPEND_PREPARE, SUSPEND_SUSPEND, + SUSPEND_SUSPEND_LATE, SUSPEND_SUSPEND_NOIRQ, SUSPEND_RESUME_NOIRQ, + SUSPEND_RESUME_EARLY, SUSPEND_RESUME }; @@ -53,8 +55,10 @@ struct suspend_stats { int failed_freeze; int failed_prepare; int failed_suspend; + int failed_suspend_late; int failed_suspend_noirq; int failed_resume; + int failed_resume_early; int failed_resume_noirq; #define REC_FAILED_NUM 2 int last_failed_dev; diff --git a/kernel/kexec.c b/kernel/kexec.c index 
7b0886786701..a6a675cb9818 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1546,13 +1546,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_noirq(). We *must* call - * dpm_suspend_noirq() now. Otherwise, drivers for + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1579,7 +1579,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..a5d4cf0aa03e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. 
*/ @@ -254,7 +254,7 @@ static int create_image(int platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -306,7 +306,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - dpm_resume_noirq(in_suspend ? + dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -394,16 +394,16 @@ int hibernation_snapshot(int platform_mode) * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. 
*/ static int resume_target_kernel(bool platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -460,7 +460,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - dpm_resume_noirq(PMSG_RECOVER); + dpm_resume_start(PMSG_RECOVER); return error; } @@ -518,7 +518,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = dpm_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -549,7 +549,7 @@ int hibernation_platform_enter(void) Platform_finish: hibernation_ops->finish(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..8c5014a4e052 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", diff --git 
a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..560a639614a1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -147,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - error = dpm_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; @@ -189,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_ops->wake) suspend_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); Platform_finish: if (suspend_ops->finish) -- cgit v1.2.3 From 5955633e91bfc5cd0a41d8d82259e1d8b32980ef Mon Sep 17 00:00:00 2001 From: Michael D Labriola Date: Sun, 29 Jan 2012 14:17:22 -0500 Subject: x86/reboot: Skip DMI checks if reboot set by user Skip DMI checks for vendor specific reboot quirks if the user passed in a reboot= arg on the command line - we should never override user choices. Signed-off-by: Michael D Labriola Cc: Alan Cox Cc: Michael D Labriola Cc: Matthew Garrett Cc: Linus Torvalds Link: http://lkml.kernel.org/r/87wr8ab9od.fsf@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 37a458b521a6..b257f0e28824 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,6 +39,14 @@ static int reboot_mode; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; +/* This variable is used privately to keep track of whether or not + * reboot_type is still set to its default value (i.e., reboot= hasn't + * been set on the command line). This is needed so that we can + * suppress DMI scanning for reboot quirks. Without it, it's + * impossible to override a faulty reboot quirk without recompiling. 
+ */ +static int reboot_default = 1; + #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) static int reboot_cpu = -1; #endif @@ -67,6 +75,12 @@ bool port_cf9_safe = false; static int __init reboot_setup(char *str) { for (;;) { + /* Having anything passed on the command line via + * reboot= will cause us to disable DMI checking + * below. + */ + reboot_default = 0; + switch (*str) { case 'w': reboot_mode = 0x1234; @@ -316,7 +330,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { static int __init reboot_init(void) { - dmi_check_system(reboot_dmi_table); + /* Only do the DMI check if reboot_type hasn't been overridden + * on the command line + */ + if (reboot_default) { + dmi_check_system(reboot_dmi_table); + } return 0; } core_initcall(reboot_init); @@ -465,7 +484,12 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { static int __init pci_reboot_init(void) { - dmi_check_system(pci_reboot_dmi_table); + /* Only do the DMI check if reboot_type hasn't been overridden + * on the command line + */ + if (reboot_default) { + dmi_check_system(pci_reboot_dmi_table); + } return 0; } core_initcall(pci_reboot_init); -- cgit v1.2.3 From e6d36a653becc7bbc643c399a77882e02bf552cb Mon Sep 17 00:00:00 2001 From: Michael D Labriola Date: Sun, 29 Jan 2012 14:21:17 -0500 Subject: x86/reboot: Remove VersaLogic Menlow reboot quirk This commit removes the reboot quirk originally added by commit e19e074 ("x86: Fix reboot problem on VersaLogic Menlow boards"). Testing with a VersaLogic Ocelot (VL-EPMs-21a rev 1.00 w/ BIOS 6.5.102) revealed the following regarding the reboot hang problem: - v2.6.37 reboot=bios was needed. - v2.6.38-rc1: behavior changed, reboot=acpi is needed, reboot=kbd and reboot=bios results in system hang. - v2.6.38: VersaLogic patch (e19e074 "x86: Fix reboot problem on VersaLogic Menlow boards") was applied prior to v2.6.38-rc7. 
This patch sets a quirk for VersaLogic Menlow boards that forces the use of reboot=bios, which doesn't work anymore. - v3.2: It seems that commit 660e34c ("x86: Reorder reboot method preferences") changed the default reboot method to acpi prior to v3.0-rc1, which means the default behavior is appropriate for the Ocelot. No VersaLogic quirk is required. The Ocelot board used for testing can successfully reboot w/out having to pass any reboot= arguments for all 3 current versions of the BIOS. Signed-off-by: Michael D Labriola Cc: Matthew Garrett Cc: Michael D Labriola Cc: Kushal Koolwal Cc: Linus Torvalds Link: http://lkml.kernel.org/r/87vcnub9hu.fsf@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index b257f0e28824..d840e69a853c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -309,14 +309,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, - { /* Handle problems with rebooting on VersaLogic Menlow boards */ - .callback = set_bios_reboot, - .ident = "VersaLogic Menlow based board", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"), - DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), - }, - }, { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, .ident = "Acer Aspire One A110", -- cgit v1.2.3 From 35f1790e6c6a7e4cae57b616cf36444d27fa6b28 Mon Sep 17 00:00:00 2001 From: He Chunhui Date: Wed, 1 Feb 2012 00:48:28 +0800 Subject: x86, boot: Fix port argument to inl() function "u32 port" in inl() should be "u16 port". [ hpa: it's a bug, but it doesn't produce incorrect code, so no need to put this into urgent or stable. ] Signed-off-by: He Chunhui Link: http://lkml.kernel.org/r/32892299.2931391328028508117.JavaMail.coremail@mailweb Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/boot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index c7093bd9f2d3..18997e5a1053 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -67,7 +67,7 @@ static inline void outl(u32 v, u16 port) { asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); } -static inline u32 inl(u32 port) +static inline u32 inl(u16 port) { u32 v; asm volatile("inl %1,%0" : "=a" (v) : "dN" (port)); -- cgit v1.2.3 From bdb42f5afebe208eae90406959383856ae2caf2b Mon Sep 17 00:00:00 2001 From: Stephan Bärwolf Date: Thu, 12 Jan 2012 16:43:03 +0100 Subject: KVM: x86: extend "struct x86_emulate_ops" with "get_cpuid" In order to be able to proceed checks on CPU-specific properties within the emulator, function "get_cpuid" is introduced. With "get_cpuid" it is possible to virtually call the guests "cpuid"-opcode without changing the VM's context. [mtosatti: cleanup/beautify code] Signed-off-by: Stephan Baerwolf Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 3 +++ arch/x86/kvm/x86.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ab4092e3214e..c8b28689eeeb 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -190,6 +190,9 @@ struct x86_emulate_ops { int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); + + bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 14d6cadc4ba6..8c890e2fa6b6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4180,6 +4180,28 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return 
kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } +static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + struct kvm_cpuid_entry2 *cpuid = NULL; + + if (eax && ecx) + cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), + *eax, *ecx); + + if (cpuid) { + *eax = cpuid->eax; + *ecx = cpuid->ecx; + if (ebx) + *ebx = cpuid->ebx; + if (edx) + *edx = cpuid->edx; + return true; + } + + return false; +} + static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -4211,6 +4233,7 @@ static struct x86_emulate_ops emulate_ops = { .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, + .get_cpuid = emulator_get_cpuid, }; static void cache_all_regs(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From c2226fc9e87ba3da060e47333657cd6616652b84 Mon Sep 17 00:00:00 2001 From: Stephan Bärwolf Date: Thu, 12 Jan 2012 16:43:04 +0100 Subject: KVM: x86: fix missing checks in syscall emulation On hosts without this patch, 32bit guests will crash (and 64bit guests may behave in a wrong way) for example by simply executing following nasm-demo-application: [bits 32] global _start SECTION .text _start: syscall (I tested it with winxp and linux - both always crashed) Disassembly of section .text: 00000000 <_start>: 0: 0f 05 syscall The reason seems a missing "invalid opcode"-trap (int6) for the syscall opcode "0f05", which is not available on Intel CPUs within non-longmodes, as also on some AMD CPUs within legacy-mode. (depending on CPU vendor, MSR_EFER and cpuid) Because the previously mentioned OSs may not engage corresponding syscall target-registers (STAR, LSTAR, CSTAR), they remain NULL and (non trapping) syscalls are leading to multiple faults and finally crashes.
Depending on the architecture (AMD or Intel) pretended by guests, various checks according to vendor's documentation are implemented to overcome the current issue and behave like the CPUs physical counterparts. [mtosatti: cleanup/beautify code] Signed-off-by: Stephan Baerwolf Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 13 ++++++++++ arch/x86/kvm/emulate.c | 51 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c8b28689eeeb..7b9cfc4878af 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -301,6 +301,19 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ X86EMUL_MODE_PROT64) +/* CPUID vendors */ +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 + +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 + +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e +#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 + enum x86_intercept_stage { X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 05a562b85025..0982507b962a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1891,6 +1891,51 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->p = 1; } +static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) +{ + struct x86_emulate_ops *ops = ctxt->ops; + u32 eax, ebx, ecx, edx; + + /* + * syscall should always be enabled in longmode - so only become + * vendor specific (cpuid) if other modes 
are active... + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return true; + + eax = 0x00000000; + ecx = 0x00000000; + if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. + */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; + + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; + } + + /* default: (not Intel, not AMD), apply Intel's stricter rules... */ + return false; +} + static int em_syscall(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -1904,9 +1949,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); + if (!(em_syscall_is_enabled(ctxt))) + return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_EFER, &efer); setup_syscalls_segments(ctxt, &cs, &ss); + if (!(efer & EFER_SCE)) + return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); -- cgit v1.2.3 From 5753785fa97742d2723ed8ebb29ae59cac912705 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 15 Jan 2012 14:17:22 +0200 Subject: KVM: do not #GP on perf MSR writes when vPMU is disabled Return to behaviour perf MSR had before introducing vPMU in case vPMU is disabled. 
Some guests access those registers unconditionally and do not expect it to fail. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c890e2fa6b6..9cbfc0698118 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1495,6 +1495,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + bool pr = false; + switch (msr) { case MSR_EFER: return set_efer(vcpu, data); @@ -1635,6 +1637,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + pr = true; + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr, data); + + if (pr || data != 0) + pr_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; case MSR_K7_CLK_CTL: /* * Ignore all writes to this no longer documented MSR. 
@@ -1835,6 +1849,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_get_msr(vcpu, msr, pdata); + data = 0; + break; case MSR_IA32_UCODE_REV: data = 0x100000000ULL; break; -- cgit v1.2.3 From 84f2b9b2edc09595569c7397cc3c888764ffd78b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 2 Feb 2012 12:04:01 +0100 Subject: perf: Remove deprecated WARN_ON_ONCE() With the new throttling/unthrottling code introduced with commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") we occasionally hit two WARN_ON_ONCE() checks in: - intel_pmu_pebs_enable() - intel_pmu_lbr_enable() - x86_pmu_start() The assertions are no longer problematic. There is a valid path where they can trigger but it is harmless. The assertion can be triggered with: $ perf record -e instructions:pp .... Leading to paths: intel_pmu_pebs_enable intel_pmu_enable_event x86_perf_event_set_period x86_pmu_start perf_adjust_freq_unthr_context perf_event_task_tick scheduler_tick And: intel_pmu_lbr_enable intel_pmu_enable_event x86_perf_event_set_period x86_pmu_start perf_adjust_freq_unthr_context. perf_event_task_tick scheduler_tick cpuc->enabled is always on because when we get to perf_adjust_freq_unthr_context() the PMU is not totally disabled. Furthermore when we need to adjust a period, we only stop the event we need to change and not the entire PMU. Thus, when we re-enable, cpuc->enabled is already set. Note that when we stop the event, both pebs and lbr are stopped if necessary (and possible). 
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20120202110401.GA30911@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 1 - arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..2a30e5ae6acf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -986,9 +986,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - if (WARN_ON_ONCE(idx == -1)) return; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 73da6b64f5b7..d6bd49faa40c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -439,7 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event) hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; cpuc->pebs_enabled |= 1ULL << hwc->idx; - WARN_ON_ONCE(cpuc->enabled); if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) intel_pmu_lbr_enable(event); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 3fab3de3ce96..47a7e63bfe54 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -72,8 +72,6 @@ void intel_pmu_lbr_enable(struct perf_event *event) if (!x86_pmu.lbr_nr) return; - WARN_ON_ONCE(cpuc->enabled); - /* * Reset the LBR stack if we changed task context to * avoid data leaks. 
-- cgit v1.2.3 From 41bd956de3dfdc3a43708fe2e0c8096c69064a1e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 1 Feb 2012 15:56:54 -0500 Subject: xen/smp: Fix CPU online/offline bug triggering a BUG: scheduling while atomic. When a user offlines a VCPU and then onlines it, we get: NMI watchdog disabled (cpu2): hardware events not enabled BUG: scheduling while atomic: swapper/2/0/0x00000002 Modules linked in: dm_multipath dm_mod xen_evtchn iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi scsi_mod libcrc32c crc32c radeon fbco ttm bitblit softcursor drm_kms_helper xen_blkfront xen_netfront xen_fbfront fb_sys_fops sysimgblt sysfillrect syscopyarea xen_kbdfront xenfs [last unloaded: Pid: 0, comm: swapper/2 Tainted: G O 3.2.0phase15.1-00003-gd6f7f5b-dirty #4 Call Trace: [] __schedule_bug+0x61/0x70 [] __schedule+0x798/0x850 [] schedule+0x3a/0x50 [] cpu_idle+0xbe/0xe0 [] cpu_bringup_and_idle+0xe/0x10 The reason for this should be obvious from this call-chain: cpu_bringup_and_idle: \- cpu_bringup | \-[preempt_disable] | |- cpu_idle \- play_dead [assuming the user offlined the VCPU] | \ | +- (xen_play_dead) | \- HYPERVISOR_VCPU_off [so VCPU is dead, once user | | onlines it starts from here] | \- cpu_bringup [preempt_disable] | +- preempt_enable_no_reschedule() +- schedule() \- preempt_enable() So we have two preempt_disable() and one preempt_enable(). Calling preempt_enable() after the cpu_bringup() in the xen_play_dead fixes the imbalance.
Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 041d4fe9dfe4..501d4e0244ba 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -409,6 +409,13 @@ static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); cpu_bringup(); + /* + * Balance out the preempt calls - as we are running in cpu_idle + * loop which has been called at bootup from cpu_bringup_and_idle. + * The cpu_bringup_and_idle called cpu_bringup which made a + * preempt_disable(). So this preempt_enable will balance it out. + */ + preempt_enable(); } #else /* !CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 207d543f472c1ac9552df79838dc807cbcaa9740 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Jan 2012 14:31:46 +0000 Subject: xen pvhvm: do not remap pirqs onto evtchns if !xen_have_vector_callback CC: stable@kernel.org #2.6.37 and onwards Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 492ade8c978e..d99346ea8fdb 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -374,7 +374,7 @@ int __init pci_xen_init(void) int __init pci_xen_hvm_init(void) { - if (!xen_feature(XENFEAT_hvm_pirqs)) + if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) return 0; #ifdef CONFIG_ACPI -- cgit v1.2.3 From 0ac2526064e5c7da5d1a47b48d37c1345e487258 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Wed, 1 Feb 2012 10:47:01 -0700 Subject: scx200_32: use PCI_VDEVICE Replace PCI_DEVICE with PCI_VDEVICE to shorten device table.
Signed-off-by: Jim Cromie Signed-off-by: Jiri Kosina --- arch/x86/platform/scx200/scx200_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/scx200/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c index 7e004acbe526..5dfd335ad50c 100644 --- a/arch/x86/platform/scx200/scx200_32.c +++ b/arch/x86/platform/scx200/scx200_32.c @@ -29,10 +29,10 @@ unsigned long scx200_gpio_shadow[2]; unsigned scx200_cb_base = 0; static struct pci_device_id scx200_tbl[] = { - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, + { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, + { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, + { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, + { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, { }, }; MODULE_DEVICE_TABLE(pci,scx200_tbl); -- cgit v1.2.3 From 8ad95f0958d3ea9a182ee4c5449c4847e0577cdc Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Wed, 1 Feb 2012 10:58:49 -0700 Subject: scx200_32: replace printks with pr_<level>s update scx200_32.c to use pr_<level>, also 2 whitespaces. 
Signed-off-by: Jim Cromie Signed-off-by: Jiri Kosina --- arch/x86/platform/scx200/scx200_32.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/scx200/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c index 5dfd335ad50c..7a9ad30d6c9f 100644 --- a/arch/x86/platform/scx200/scx200_32.c +++ b/arch/x86/platform/scx200/scx200_32.c @@ -17,8 +17,6 @@ /* Verify that the configuration block really is there */ #define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) -#define NAME "scx200" - MODULE_AUTHOR("Christer Weinigel "); MODULE_DESCRIPTION("NatSemi SCx200 Driver"); MODULE_LICENSE("GPL"); @@ -63,10 +61,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_ if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { base = pci_resource_start(pdev, 0); - printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); + pr_info("GPIO base 0x%x\n", base); - if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) { - printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); + if (!request_region(base, SCx200_GPIO_SIZE, + "NatSemi SCx200 GPIO")) { + pr_err("can't allocate I/O for GPIOs\n"); return -EBUSY; } @@ -82,11 +81,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_ if (scx200_cb_probe(base)) { scx200_cb_base = base; } else { - printk(KERN_WARNING NAME ": Configuration Block not found\n"); + pr_warn("Configuration Block not found\n"); return -ENODEV; } } - printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); + pr_info("Configuration Block base 0x%x\n", scx200_cb_base); } return 0; @@ -111,8 +110,7 @@ u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) static int __init scx200_init(void) { - printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); - + pr_info("NatSemi SCx200 Driver\n"); return pci_register_driver(&scx200_pci_driver); } -- cgit v1.2.3 
From b43ab901d671e3e3cad425ea5e9a3c74e266dcdd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 27 Jun 2011 09:26:23 +0200 Subject: gpio: Add a driver for Sodaville GPIO controller Sodaville has a GPIO controller behind the PCI bus. To my surprise, it is not the same as on PXA. The interrupt & gpio chip can be referenced from the device tree like from any other driver. Unfortunately the driver which uses the gpio interrupt has to use irq_of_parse_and_map() instead of platform_get_irq(). The problem is that the platform device (which is created from the device tree) is most likely created before the interrupt chip is registered and therefore irq_of_parse_and_map() fails. In theory the driver works as a module. In reality most of the irq functions are not exported to modules and it is possible that _this_ module is unloaded while the provided irqs are still in use. Signed-off-by: Hans J. Koch [torbenh@linutronix.de: make it work after the irq namespace cleanup, add some device tree entries.] 
Signed-off-by: Torben Hohn [bigeasy@linutronix.de: convert to generic irq & gpio chip] Signed-off-by: Sebastian Andrzej Siewior [grant.likely@secretlab.ca: depend on x86 to avoid irq_domain breakage] Signed-off-by: Grant Likely --- .../devicetree/bindings/gpio/sodaville.txt | 48 ++++ arch/x86/platform/ce4100/falconfalls.dts | 7 +- drivers/gpio/Kconfig | 8 + drivers/gpio/Makefile | 1 + drivers/gpio/gpio-sodaville.c | 302 +++++++++++++++++++++ 5 files changed, 364 insertions(+), 2 deletions(-) create mode 100644 Documentation/devicetree/bindings/gpio/sodaville.txt create mode 100644 drivers/gpio/gpio-sodaville.c (limited to 'arch/x86') diff --git a/Documentation/devicetree/bindings/gpio/sodaville.txt b/Documentation/devicetree/bindings/gpio/sodaville.txt new file mode 100644 index 000000000000..563eff22b975 --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/sodaville.txt @@ -0,0 +1,48 @@ +GPIO controller on CE4100 / Sodaville SoCs +========================================== + +The bindings for CE4100's GPIO controller match the generic description +which is covered by the gpio.txt file in this folder. + +The only additional property is the intel,muxctl property which holds the +value which is written into the MUXCNTL register. + +There is no compatible property for now because the driver is probed via +PCI id (vendor 0x8086 device 0x2e67). + +The interrupt specifier consists of two cells encoded as follows: + - <1st cell>: The interrupt-number that identifies the interrupt source. 
+ - <2nd cell>: The level-sense information, encoded as follows: + 4 - active high level-sensitive + 8 - active low level-sensitive + +Example of the GPIO device and one user: + + pcigpio: gpio@b,1 { + /* two cells for GPIO and interrupt */ + #gpio-cells = <2>; + #interrupt-cells = <2>; + compatible = "pci8086,2e67.2", + "pci8086,2e67", + "pciclassff0000", + "pciclassff00"; + + reg = <0x15900 0x0 0x0 0x0 0x0>; + /* Interrupt line of the gpio device */ + interrupts = <15 1>; + /* It is an interrupt and GPIO controller itself */ + interrupt-controller; + gpio-controller; + intel,muxctl = <0>; + }; + + testuser@20 { + compatible = "example,testuser"; + /* User the 11th GPIO line as an active high triggered + * level interrupt + */ + interrupts = <11 8>; + interrupt-parent = <&pcigpio>; + /* Use this GPIO also with the gpio functions */ + gpios = <&pcigpio 11 0>; + }; diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index e70be38ce039..ce874f872cc6 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -208,16 +208,19 @@ interrupts = <14 1>; }; - gpio@b,1 { + pcigpio: gpio@b,1 { + #gpio-cells = <2>; + #interrupt-cells = <2>; compatible = "pci8086,2e67.2", "pci8086,2e67", "pciclassff0000", "pciclassff00"; - #gpio-cells = <2>; reg = <0x15900 0x0 0x0 0x0 0x0>; interrupts = <15 1>; + interrupt-controller; gpio-controller; + intel,muxctl = <0>; }; i2c-controller@b,2 { diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index eaa7d3828e70..dbb1909ca0a2 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -417,6 +417,14 @@ config GPIO_ML_IOH Hub) which is for IVI(In-Vehicle Infotainment) use. This driver can access the IOH's GPIO device. +config GPIO_SODAVILLE + bool "Intel Sodaville GPIO support" + depends on X86 && PCI && OF + select GPIO_GENERIC + select GENERIC_IRQ_CHIP + help + Say Y here to support Intel Sodaville GPIO. 
+ config GPIO_TIMBERDALE bool "Support for timberdale GPIO IP" depends on MFD_TIMBERDALE && HAS_IOMEM diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 8863a7f2dece..593bdcd1976e 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_GPIO_RDC321X) += gpio-rdc321x.o obj-$(CONFIG_PLAT_SAMSUNG) += gpio-samsung.o obj-$(CONFIG_ARCH_SA1100) += gpio-sa1100.o obj-$(CONFIG_GPIO_SCH) += gpio-sch.o +obj-$(CONFIG_GPIO_SODAVILLE) += gpio-sodaville.o obj-$(CONFIG_GPIO_STMPE) += gpio-stmpe.o obj-$(CONFIG_GPIO_SX150X) += gpio-sx150x.o obj-$(CONFIG_GPIO_TC3589X) += gpio-tc3589x.o diff --git a/drivers/gpio/gpio-sodaville.c b/drivers/gpio/gpio-sodaville.c new file mode 100644 index 000000000000..9ba15d31d242 --- /dev/null +++ b/drivers/gpio/gpio-sodaville.c @@ -0,0 +1,302 @@ +/* + * GPIO interface for Intel Sodaville SoCs. + * + * Copyright (c) 2010, 2011 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 2 as published + * by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "sdv_gpio" +#define SDV_NUM_PUB_GPIOS 12 +#define PCI_DEVICE_ID_SDV_GPIO 0x2e67 +#define GPIO_BAR 0 + +#define GPOUTR 0x00 +#define GPOER 0x04 +#define GPINR 0x08 + +#define GPSTR 0x0c +#define GPIT1R0 0x10 +#define GPIO_INT 0x14 +#define GPIT1R1 0x18 + +#define GPMUXCTL 0x1c + +struct sdv_gpio_chip_data { + int irq_base; + void __iomem *gpio_pub_base; + struct irq_domain id; + struct irq_chip_generic *gc; + struct bgpio_chip bgpio; +}; + +static int sdv_gpio_pub_set_type(struct irq_data *d, unsigned int type) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct sdv_gpio_chip_data *sd = gc->private; + void __iomem *type_reg; + u32 irq_offs = d->irq - sd->irq_base; + u32 reg; + + if (irq_offs < 8) + type_reg = sd->gpio_pub_base + GPIT1R0; + else + type_reg = sd->gpio_pub_base + GPIT1R1; + + reg = readl(type_reg); + + switch (type) { + case IRQ_TYPE_LEVEL_HIGH: + reg &= ~BIT(4 * (irq_offs % 8)); + break; + + case IRQ_TYPE_LEVEL_LOW: + reg |= BIT(4 * (irq_offs % 8)); + break; + + default: + return -EINVAL; + } + + writel(reg, type_reg); + return 0; +} + +static irqreturn_t sdv_gpio_pub_irq_handler(int irq, void *data) +{ + struct sdv_gpio_chip_data *sd = data; + u32 irq_stat = readl(sd->gpio_pub_base + GPSTR); + + irq_stat &= readl(sd->gpio_pub_base + GPIO_INT); + if (!irq_stat) + return IRQ_NONE; + + while (irq_stat) { + u32 irq_bit = __fls(irq_stat); + + irq_stat &= ~BIT(irq_bit); + generic_handle_irq(sd->irq_base + irq_bit); + } + + return IRQ_HANDLED; +} + +static int sdv_xlate(struct irq_domain *h, struct device_node *node, + const u32 *intspec, u32 intsize, irq_hw_number_t *out_hwirq, + u32 *out_type) +{ + u32 line, type; + + if (node != h->of_node) + return -EINVAL; + + if (intsize < 2) + return -EINVAL; + + line = *intspec; + *out_hwirq = line; + + intspec++; + type = *intspec; + + switch 
(type) { + case IRQ_TYPE_LEVEL_LOW: + case IRQ_TYPE_LEVEL_HIGH: + *out_type = type; + break; + default: + return -EINVAL; + } + return 0; +} + +static struct irq_domain_ops irq_domain_sdv_ops = { + .dt_translate = sdv_xlate, +}; + +static __devinit int sdv_register_irqsupport(struct sdv_gpio_chip_data *sd, + struct pci_dev *pdev) +{ + struct irq_chip_type *ct; + int ret; + + sd->irq_base = irq_alloc_descs(-1, 0, SDV_NUM_PUB_GPIOS, -1); + if (sd->irq_base < 0) + return sd->irq_base; + + /* mask + ACK all interrupt sources */ + writel(0, sd->gpio_pub_base + GPIO_INT); + writel((1 << 11) - 1, sd->gpio_pub_base + GPSTR); + + ret = request_irq(pdev->irq, sdv_gpio_pub_irq_handler, IRQF_SHARED, + "sdv_gpio", sd); + if (ret) + goto out_free_desc; + + sd->id.irq_base = sd->irq_base; + sd->id.of_node = of_node_get(pdev->dev.of_node); + sd->id.ops = &irq_domain_sdv_ops; + + /* + * This gpio irq controller latches level irqs. Testing shows that if + * we unmask & ACK the IRQ before the source of the interrupt is gone + * then the interrupt is active again. 
+ */ + sd->gc = irq_alloc_generic_chip("sdv-gpio", 1, sd->irq_base, + sd->gpio_pub_base, handle_fasteoi_irq); + if (!sd->gc) { + ret = -ENOMEM; + goto out_free_irq; + } + + sd->gc->private = sd; + ct = sd->gc->chip_types; + ct->type = IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW; + ct->regs.eoi = GPSTR; + ct->regs.mask = GPIO_INT; + ct->chip.irq_mask = irq_gc_mask_clr_bit; + ct->chip.irq_unmask = irq_gc_mask_set_bit; + ct->chip.irq_eoi = irq_gc_eoi; + ct->chip.irq_set_type = sdv_gpio_pub_set_type; + + irq_setup_generic_chip(sd->gc, IRQ_MSK(SDV_NUM_PUB_GPIOS), + IRQ_GC_INIT_MASK_CACHE, IRQ_NOREQUEST, + IRQ_LEVEL | IRQ_NOPROBE); + + irq_domain_add(&sd->id); + return 0; +out_free_irq: + free_irq(pdev->irq, sd); +out_free_desc: + irq_free_descs(sd->irq_base, SDV_NUM_PUB_GPIOS); + return ret; +} + +static int __devinit sdv_gpio_probe(struct pci_dev *pdev, + const struct pci_device_id *pci_id) +{ + struct sdv_gpio_chip_data *sd; + unsigned long addr; + const void *prop; + int len; + int ret; + u32 mux_val; + + sd = kzalloc(sizeof(struct sdv_gpio_chip_data), GFP_KERNEL); + if (!sd) + return -ENOMEM; + ret = pci_enable_device(pdev); + if (ret) { + dev_err(&pdev->dev, "can't enable device.\n"); + goto done; + } + + ret = pci_request_region(pdev, GPIO_BAR, DRV_NAME); + if (ret) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", GPIO_BAR); + goto disable_pci; + } + + addr = pci_resource_start(pdev, GPIO_BAR); + if (!addr) + goto release_reg; + sd->gpio_pub_base = ioremap(addr, pci_resource_len(pdev, GPIO_BAR)); + + prop = of_get_property(pdev->dev.of_node, "intel,muxctl", &len); + if (prop && len == 4) { + mux_val = of_read_number(prop, 1); + writel(mux_val, sd->gpio_pub_base + GPMUXCTL); + } + + ret = bgpio_init(&sd->bgpio, &pdev->dev, 4, + sd->gpio_pub_base + GPINR, sd->gpio_pub_base + GPOUTR, + NULL, sd->gpio_pub_base + GPOER, NULL, false); + if (ret) + goto unmap; + sd->bgpio.gc.ngpio = SDV_NUM_PUB_GPIOS; + + ret = gpiochip_add(&sd->bgpio.gc); + if (ret < 0) { + 
dev_err(&pdev->dev, "gpiochip_add() failed.\n"); + goto unmap; + } + + ret = sdv_register_irqsupport(sd, pdev); + if (ret) + goto unmap; + + pci_set_drvdata(pdev, sd); + dev_info(&pdev->dev, "Sodaville GPIO driver registered.\n"); + return 0; + +unmap: + iounmap(sd->gpio_pub_base); +release_reg: + pci_release_region(pdev, GPIO_BAR); +disable_pci: + pci_disable_device(pdev); +done: + kfree(sd); + return ret; +} + +static void sdv_gpio_remove(struct pci_dev *pdev) +{ + struct sdv_gpio_chip_data *sd = pci_get_drvdata(pdev); + + irq_domain_del(&sd->id); + free_irq(pdev->irq, sd); + irq_free_descs(sd->irq_base, SDV_NUM_PUB_GPIOS); + + if (gpiochip_remove(&sd->bgpio.gc)) + dev_err(&pdev->dev, "gpiochip_remove() failed.\n"); + + pci_release_region(pdev, GPIO_BAR); + iounmap(sd->gpio_pub_base); + pci_disable_device(pdev); + kfree(sd); +} + +static struct pci_device_id sdv_gpio_pci_ids[] __devinitdata = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_SDV_GPIO) }, + { 0, }, +}; + +static struct pci_driver sdv_gpio_driver = { + .name = DRV_NAME, + .id_table = sdv_gpio_pci_ids, + .probe = sdv_gpio_probe, + .remove = sdv_gpio_remove, +}; + +static int __init sdv_gpio_init(void) +{ + return pci_register_driver(&sdv_gpio_driver); +} +module_init(sdv_gpio_init); + +static void __exit sdv_gpio_exit(void) +{ + pci_unregister_driver(&sdv_gpio_driver); +} +module_exit(sdv_gpio_exit); + +MODULE_AUTHOR("Hans J. Koch "); +MODULE_DESCRIPTION("GPIO interface for Intel Sodaville SoCs"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 7931d493051ea9b09e4fddee2dc40b2eb88d62b9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2012 15:06:26 +0000 Subject: x86/spinlocks: Eliminate TICKET_MASK The definition of it being questionable already (unnecessarily including a cast), and it being used in a single place that can be written shorter without it, remove this #define. 
Along the same lines, simplify __ticket_spin_is_locked()'s main expression, which was the more convoluted way because of needs that went away with the recent type changes by Jeremy. This is pure cleanup, no functional change intended. Signed-off-by: Jan Beulich Acked-by: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F2C06020200007800071066@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 4 ++-- arch/x86/include/asm/spinlock_types.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index a82c2bf504b6..76bfa2cf301d 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -88,14 +88,14 @@ static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return !!(tmp.tail ^ tmp.head); + return tmp.tail != tmp.head; } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return ((tmp.tail - tmp.head) & TICKET_MASK) > 1; + return (__ticket_t)(tmp.tail - tmp.head) > 1; } #ifndef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 8ebd5df7451e..ad0ad07fc006 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -16,7 +16,6 @@ typedef u32 __ticketpair_t; #endif #define TICKET_SHIFT (sizeof(__ticket_t) * 8) -#define TICKET_MASK ((__ticket_t)((1 << TICKET_SHIFT) - 1)) typedef struct arch_spinlock { union { -- cgit v1.2.3 From c1d2f1bccf4259384e581b937e694ee8a350fe55 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 6 Feb 2012 13:28:55 -0500 Subject: x86/microcode: Remove noisy AMD microcode warning AMD processors will never support /dev/cpu/microcode updating so just silently fail instead of 
printing out a warning for every cpu. Signed-off-by: Prarit Bhargava Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1328552935-965-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index ac0417be9131..73465aab28f8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -360,7 +360,6 @@ out: static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); return UCODE_ERROR; } -- cgit v1.2.3 From c98fdeaa92731308ed80386261fa2589addefa47 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Feb 2012 13:08:52 +0100 Subject: x86/sched/perf/AMD: Set sched_clock_stable Stephane Eranian reported that doing a scheduler latency measurements with perf on AMD doesn't work out as expected due to the fact that the sched_clock() granularity is too coarse, i.e. done in jiffies due to the sched_clock_stable not set, which, if set, would mean that we get to use the TSC as sample source which would give us much higher precision. However, there's no reason not to set sched_clock_stable on AMD because all families from F10h and upwards do have an invariant TSC and have the CPUID flag to prove (CPUID_8000_0007_EDX[8]). Make it so, #1. Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Venki Pallipadi Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Arnaldo Carvalho de Melo Cc: Robert Richter Cc: Eric Dumazet Cc: Andreas Herrmann Link: http://lkml.kernel.org/r/20120206132546.GA30854@quad [ Should any non-standard system break the TSC, we should mark them so explicitly, in their platform init handler, or in a DMI quirk. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f4773f4aae35..0a44b90602b0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -456,6 +457,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + if (!check_tsc_unstable()) + sched_clock_stable = 1; } #ifdef CONFIG_X86_64 -- cgit v1.2.3 From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 7 Feb 2012 14:39:57 +0100 Subject: perf: Fix double start/stop in x86_pmu_start() The following patch fixes a bug introduced by the following commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") The patch caused the following warning to pop up depending on the sampling frequency adjustments: ------------[ cut here ]------------ WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4() It was caused by the following call sequence: perf_adjust_freq_unthr_context.part() { stop() if (delta > 0) { perf_adjust_period() { if (period > 8*...) { stop() ... start() } } } start() } Which caused a double start and a double stop, thus triggering the assert in x86_pmu_start(). The patch fixes the problem by avoiding the double calls. We pass a new argument to perf_adjust_period() to indicate whether or not the event is already stopped. We can't just remove the start/stop from that function because it's called from __perf_event_overflow where the event needs to be reloaded via a stop/start back-to-back call. 
The patch reintroduces the assertion in x86_pmu_start() which was removed by commit: 84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()") In this second version, we've added calls to disable/enable PMU during unthrottling or frequency adjustment based on bug report of spurious NMI interrupts from Eric Dumazet. Reported-and-tested-by: Eric Dumazet Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: markus@trippelsdorf.de Cc: paulus@samba.org Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad [ Minor edits to the changelog and to the code ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ kernel/events/core.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2a30e5ae6acf..5adce1040b11 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -986,6 +986,9 @@ static void x86_pmu_start(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + if (WARN_ON_ONCE(idx == -1)) return; diff --git a/kernel/events/core.c b/kernel/events/core.c index ba36013cfb21..1b5c081d8b9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2303,7 +2303,7 @@ do { \ static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); + if (disable) + 
event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } @@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, return; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, /* * restart the event * reload only if value has changed + * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* -- cgit v1.2.3 From 32c3233885eb10ac9cb9410f2f8cd64b8df2b2a1 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 8 Feb 2012 20:52:29 +0100 Subject: x86/amd: Fix L1i and L2 cache sharing information for AMD family 15h processors For L1 instruction cache and L2 cache the shared CPU information is wrong. On current AMD family 15h CPUs those caches are shared between both cores of a compute unit. 
This fixes https://bugzilla.kernel.org/show_bug.cgi?id=42607 Signed-off-by: Andreas Herrmann Cc: Petkov Borislav Cc: Dave Jones Cc: Link: http://lkml.kernel.org/r/20120208195229.GA17523@alberich.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 44 ++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b45e5e7a901..73d08ed98a64 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { int node; @@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) #ifdef CONFIG_SMP -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) + +static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { - struct _cpuid4_info *this_leaf, *sibling_leaf; - unsigned long num_threads_sharing; - int index_msb, i, sibling; + struct _cpuid4_info *this_leaf; + int ret, i, sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { + ret = 0; + if (index == 3) { + ret = 1; for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; @@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) set_bit(sibling, this_leaf->shared_cpu_map); } } - return; + } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { + ret = 1; + for_each_cpu(i, cpu_sibling_mask(cpu)) { + if (!per_cpu(ici_cpuid4_info, i)) + 
continue; + this_leaf = CPUID4_INFO_IDX(i, index); + for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } + } } + + return ret; +} + +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD) { + if (cache_shared_amd_cpu_map_setup(cpu, index)) + return; + } + this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; -- cgit v1.2.3 From f8d98f1095210da708a59f3a0b6fd267ad8f3f03 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 10 Feb 2012 14:33:40 +0900 Subject: x86: Fix to decode grouped AVX with VEX pp bits Fix to decode grouped AVX with VEX pp bits which should be handled as same as last-prefixes. This fixes below warnings in posttest with CONFIG_CRYPTO_SHA1_SSSE3=y. Warning: arch/x86/tools/test_get_len found difference at :ffffffff810d5fc0 Warning: ffffffff810d6069: c5 f9 73 de 04 vpsrldq $0x4,%xmm6,%xmm0 Warning: objdump says 5 bytes, but insn_get_length() says 4 ... With this change, test_get_len can decode it correctly. 
$ arch/x86/tools/test_get_len -v -y ffffffff810d6069: c5 f9 73 de 04 vpsrldq $0x4,%xmm6,%xmm0 Succeed: decoded and checked 1 instructions Reported-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Cc: yrl.pp-manager.tt@hitachi.com Link: http://lkml.kernel.org/r/20120210053340.30429.73410.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/include/asm/inat.h | 5 +++-- arch/x86/include/asm/insn.h | 18 ++++++++++++------ arch/x86/lib/inat.c | 36 ++++++++++++++++++------------------ arch/x86/lib/insn.c | 13 +++++++------ 4 files changed, 40 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 205b063e3e32..74a2e312e8a2 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -97,11 +97,12 @@ /* Attribute search APIs */ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); +extern int inat_get_last_prefix_id(insn_byte_t last_pfx); extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, - insn_byte_t last_pfx, + int lpfx_id, insn_attr_t esc_attr); extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, - insn_byte_t last_pfx, + int lpfx_id, insn_attr_t esc_attr); extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 74df3f1eddfd..48eb30a86062 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -96,12 +96,6 @@ struct insn { #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ -/* The last prefix is needed for two-byte and three-byte opcodes */ -static inline insn_byte_t insn_last_prefix(struct insn *insn) -{ - return insn->prefixes.bytes[3]; -} - extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); @@ -160,6 +154,18 @@ 
static inline insn_byte_t insn_vex_p_bits(struct insn *insn) return X86_VEX_P(insn->vex_prefix.bytes[2]); } +/* Get the last prefix id from last prefix or VEX prefix */ +static inline int insn_last_prefix_id(struct insn *insn) +{ + if (insn_is_avx(insn)) + return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + + if (insn->prefixes.bytes[3]) + return inat_get_last_prefix_id(insn->prefixes.bytes[3]); + + return 0; +} + /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index 88ad5fbda6e1..c1f01a8e9f65 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -29,46 +29,46 @@ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) return inat_primary_table[opcode]; } -insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx, +int inat_get_last_prefix_id(insn_byte_t last_pfx) +{ + insn_attr_t lpfx_attr; + + lpfx_attr = inat_get_opcode_attribute(last_pfx); + return inat_last_prefix_id(lpfx_attr); +} + +insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, insn_attr_t esc_attr) { const insn_attr_t *table; - insn_attr_t lpfx_attr; - int n, m = 0; + int n; n = inat_escape_id(esc_attr); - if (last_pfx) { - lpfx_attr = inat_get_opcode_attribute(last_pfx); - m = inat_last_prefix_id(lpfx_attr); - } + table = inat_escape_tables[n][0]; if (!table) return 0; - if (inat_has_variant(table[opcode]) && m) { - table = inat_escape_tables[n][m]; + if (inat_has_variant(table[opcode]) && lpfx_id) { + table = inat_escape_tables[n][lpfx_id]; if (!table) return 0; } return table[opcode]; } -insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx, +insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, insn_attr_t grp_attr) { const insn_attr_t *table; - insn_attr_t lpfx_attr; - int n, m = 0; + int n; n = inat_group_id(grp_attr); - if (last_pfx) { - lpfx_attr = inat_get_opcode_attribute(last_pfx); - m = 
inat_last_prefix_id(lpfx_attr); - } + table = inat_group_tables[n][0]; if (!table) return inat_group_common_attribute(grp_attr); - if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) { - table = inat_group_tables[n][m]; + if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { + table = inat_group_tables[n][lpfx_id]; if (!table) return inat_group_common_attribute(grp_attr); } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 5a1f9f3e3fbb..25feb1ae71c5 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -185,7 +185,8 @@ err_out: void insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; - insn_byte_t op, pfx; + insn_byte_t op; + int pfx_id; if (opcode->got) return; if (!insn->prefixes.got) @@ -212,8 +213,8 @@ void insn_get_opcode(struct insn *insn) /* Get escaped opcode */ op = get_next(insn_byte_t, insn); opcode->bytes[opcode->nbytes++] = op; - pfx = insn_last_prefix(insn); - insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } if (inat_must_vex(insn->attr)) insn->attr = 0; /* This instruction is bad */ @@ -235,7 +236,7 @@ err_out: void insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; - insn_byte_t pfx, mod; + insn_byte_t pfx_id, mod; if (modrm->got) return; if (!insn->opcode.got) @@ -246,8 +247,8 @@ void insn_get_modrm(struct insn *insn) modrm->value = mod; modrm->nbytes = 1; if (inat_is_group(insn->attr)) { - pfx = insn_last_prefix(insn); - insn->attr = inat_get_group_attribute(mod, pfx, + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) insn->attr = 0; /* This is bad */ -- cgit v1.2.3 From 21c3fcf3e39353d4f21d50e257cc74f3204b1988 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 12 Feb 2012 09:53:57 -0800 Subject: x86/debug: Fix/improve the show_msr= debug 
print out Found out that show_msr= is broken, when I asked a user to use it to capture debug info about broken MTRR's whose MTRR settings are probably different between CPUs. Only the first CPUs MSRs are printed, but that is not enough to track down the suspected bug. For years we called print_cpu_msr from print_cpu_info(), but this commit: | commit 2eaad1fddd7450a48ad464229775f97fbfe8af36 | Author: Mike Travis | Date: Thu Dec 10 17:19:36 2009 -0800 | | x86: Limit the number of processor bootup messages removed the print_cpu_info() call from all APs. Put it back - it will only print MSRs when the user specifically requests them via show_msr=. Signed-off-by: Yinghai Lu Cc: Mike Travis Link: http://lkml.kernel.org/r/1329069237-11483-1-git-send-email-yinghai@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 14 +++++++------- arch/x86/kernel/smpboot.c | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c26931..8bb062bbcbec 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -162,6 +162,7 @@ extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); +void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..8b6a3bb57d8e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -933,7 +933,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = { { 0xc0011000, 0xc001103b}, }; -static void __cpuinit print_cpu_msr(void) +static void __cpuinit 
__print_cpu_msr(void) { unsigned index_min, index_max; unsigned index; @@ -997,13 +997,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "\n"); -#ifdef CONFIG_SMP + __print_cpu_msr(); +} + +void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) +{ if (c->cpu_index < show_msr) - print_cpu_msr(); -#else - if (show_msr) - print_cpu_msr(); -#endif + __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..257049d7c657 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -791,9 +791,10 @@ do_rest: schedule(); } - if (cpumask_test_cpu(cpu, cpu_callin_mask)) + if (cpumask_test_cpu(cpu, cpu_callin_mask)) { + print_cpu_msr(&cpu_data(cpu)); pr_debug("CPU%d: has booted.\n", cpu); - else { + } else { boot_error = 1; if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) == 0xA5A5A5A5) -- cgit v1.2.3 From 484546509ce5d49d43ec0a6eb2141c6bf3362bfc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 7 Feb 2012 09:40:30 -0500 Subject: x86/tracing: Denote the power and cpuidle tracepoints as _rcuidle() The power and cpuidle tracepoints are called within a rcu_idle_exit() section, and must be denoted with the _rcuidle() version of the tracepoint. Acked-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Signed-off-by: Steven Rostedt --- arch/x86/kernel/process.c | 24 ++++++++++++------------ include/trace/events/power.h | 2 ++ 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe3..44eefde92109 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -377,8 +377,8 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle(1, smp_processor_id()); + trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle_rcuidle(1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -391,8 +391,8 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } else { local_irq_enable(); /* loop is done by the caller */ @@ -450,8 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle(1, smp_processor_id()); + trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)&current_thread_info()->flags); @@ -461,8 +461,8 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } else local_irq_enable(); } @@ -474,13 +474,13 @@ static void mwait_idle(void) */ 
static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0, smp_processor_id()); - trace_cpu_idle(0, smp_processor_id()); + trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(smp_processor_id()); - trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } /* diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 1bcc2a8c00e2..14b38940062b 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -151,6 +151,8 @@ enum { events get removed */ static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {}; static inline void trace_power_end(u64 cpuid) {}; +static inline void trace_power_start_rcuidle(u64 type, u64 state, u64 cpuid) {}; +static inline void trace_power_end_rcuidle(u64 cpuid) {}; static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {}; #endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */ -- cgit v1.2.3 From be98c2cdb15ba26148cd2bd58a857d4f7759ed38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Feb 2012 13:47:25 -0800 Subject: i387: math_state_restore() isn't called from asm It was marked asmlinkage for some really old and stale legacy reasons. Fix that and the equally stale comment. Noticed when debugging the irq_fpu_usable() bugs. 
Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 2 +- arch/x86/kernel/traps.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 6919e936345b..a5c7ae504176 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,7 +29,7 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern asmlinkage void math_state_restore(void); +extern void math_state_restore(void); extern void __math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 482ec3af2067..982433b5da30 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -599,10 +599,10 @@ void __math_state_restore(void) * Careful.. There are problems with IBM-designed IRQ13 behaviour. * Don't touch unless you *really* know how it works. * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). + * Must be called with kernel preemption disabled (eg with local + * local interrupts as in the case of do_device_not_available). */ -asmlinkage void math_state_restore(void) +void math_state_restore(void) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; -- cgit v1.2.3 From 5b1cbac37798805c1fee18c8cebe5c0a13975b17 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Feb 2012 13:56:14 -0800 Subject: i387: make irq_fpu_usable() tests more robust Some code - especially the crypto layer - wants to use the x86 FP/MMX/AVX register set in what may be interrupt (typically softirq) context. That *can* be ok, but the tests for when it was ok were somewhat suspect. 
We cannot touch the thread-specific status bits either, so we'd better check that we're not going to try to save FP state or anything like that. Now, it may be that the TS bit is always cleared *before* we set the USEDFPU bit (and only set when we had already cleared the USEDFP before), so the TS bit test may actually have been sufficient, but it certainly was not obviously so. So this explicitly verifies that we will not touch the TS_USEDFPU bit, and adds a few related sanity-checks. Because it seems that somehow AES-NI is corrupting user FP state. The cause is not clear, and this patch doesn't fix it, but while debugging it I really wanted the code to be more obviously correct and robust. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 54 ++++++++++++++++++++++++++++++++++++++------- arch/x86/kernel/traps.c | 1 + 2 files changed, 47 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a5c7ae504176..a29571821b99 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -307,9 +307,54 @@ static inline void __clear_fpu(struct task_struct *tsk) } } +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: TS_USEDFPU must be clear (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + return !(current_thread_info()->status & TS_USEDFPU) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. 
+ */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +static inline bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} + static inline void kernel_fpu_begin(void) { struct thread_info *me = current_thread_info(); + + WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); if (me->status & TS_USEDFPU) __save_init_fpu(me->task); @@ -323,14 +368,6 @@ static inline void kernel_fpu_end(void) preempt_enable(); } -static inline bool irq_fpu_usable(void) -{ - struct pt_regs *regs; - - return !in_interrupt() || !(regs = get_irq_regs()) || \ - user_mode(regs) || (read_cr0() & X86_CR0_TS); -} - /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. 
And these instructions @@ -367,6 +404,7 @@ static inline void irq_ts_restore(int TS_state) */ static inline void save_init_fpu(struct task_struct *tsk) { + WARN_ON_ONCE(task_thread_info(tsk)->status & TS_USEDFPU); preempt_disable(); __save_init_fpu(tsk); stts(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 982433b5da30..8ba27dbc107a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -631,6 +631,7 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { + WARN_ON_ONCE(!user_mode_vm(regs)); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; -- cgit v1.2.3 From 70142a9dd154f54f7409871ead86f7d77f2c6576 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 11 Feb 2012 22:56:59 +0000 Subject: x86/cpu: Fix overrun check in arch_print_cpu_modalias() snprintf() does not return a negative value when truncating. Signed-off-by: Ben Hutchings Acked-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 940e2d483076..2dfa52bcdfe2 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -67,7 +67,7 @@ ssize_t arch_print_cpu_modalias(struct device *dev, for (i = 0; i < NCAPINTS*32; i++) { if (boot_cpu_has(i)) { n = snprintf(buf, size, ",%04X", i); - if (n < 0) { + if (n >= size) { WARN(1, "x86 features overflow page\n"); break; } -- cgit v1.2.3 From 5467bdda4a326513c2f14b712a22d59115b7ae94 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 11 Feb 2012 22:57:19 +0000 Subject: x86/cpu: Clean up modalias feature matching We currently include commas on both sides of the feature ID in a modalias, but this prevents the lowest numbered feature of a CPU from being matched. 
Since all feature IDs have the same length, we do not need to worry about substring matches, so omit commas from the modalias entirely. Avoid generating multiple adjacent wildcards when there is no feature ID to match. Signed-off-by: Ben Hutchings Acked-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/match.c | 3 +-- scripts/mod/file2alias.c | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 2dfa52bcdfe2..5502b289341b 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -63,7 +63,7 @@ ssize_t arch_print_cpu_modalias(struct device *dev, boot_cpu_data.x86_model); size -= n; buf += n; - size -= 2; + size -= 1; for (i = 0; i < NCAPINTS*32; i++) { if (boot_cpu_has(i)) { n = snprintf(buf, size, ",%04X", i); @@ -75,7 +75,6 @@ ssize_t arch_print_cpu_modalias(struct device *dev, buf += n; } } - *buf++ = ','; *buf++ = '\n'; return buf - bufptr; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index a468af059834..78fd81fb9732 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1021,8 +1021,9 @@ static int do_x86cpu_entry(const char *filename, struct x86_cpu_id *id, ADD(alias, "vendor:", id->vendor != X86_VENDOR_ANY, id->vendor); ADD(alias, ":family:", id->family != X86_FAMILY_ANY, id->family); ADD(alias, ":model:", id->model != X86_MODEL_ANY, id->model); - ADD(alias, ":feature:*,", id->feature != X86_FEATURE_ANY, id->feature); - strcat(alias, ",*"); + strcat(alias, ":feature:*"); + if (id->feature != X86_FEATURE_ANY) + sprintf(alias + strlen(alias), "%04X*", id->feature); return 1; } ADD_TO_DEVTABLE("x86cpu", struct x86_cpu_id, do_x86cpu_entry); -- cgit v1.2.3 From 8d21190e223a785a351a1078ac6e3700809969b6 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 9 Feb 2012 23:02:16 +0100 Subject: crypto: twofish-x86 - Remove dead code from 
twofish_glue_3way.c::init() We can never reach the line just after the 'return 0' statement. Remove it. Signed-off-by: Jesper Juhl Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 0afd134d8c9c..2c7f14ec7082 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -706,7 +706,6 @@ int __init init(void) return 0; - crypto_unregister_alg(&blk_xts_alg); blk_xts_err: crypto_unregister_alg(&blk_lrw_alg); blk_lrw_err: -- cgit v1.2.3 From 6e77fe8c1100bfb3c6f5b2558d4556519b837b65 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 9 Feb 2012 23:16:04 +0100 Subject: crypto: serpent-sse2 - remove dead code from serpent_sse2_glue.c::serpent_sse2_init() We cannot reach the line after 'return err'. Remove it. Signed-off-by: Jesper Juhl Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 7955a9b76b91..de81cf4e06a1 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -1025,7 +1025,6 @@ static int __init serpent_sse2_init(void) goto ablk_xts_err; return err; - crypto_unregister_alg(&ablk_xts_alg); ablk_xts_err: crypto_unregister_alg(&blk_xts_alg); blk_xts_err: -- cgit v1.2.3 From 925845bd49c6de437dfab3bf8dc654ea3ae21d74 Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Mon, 21 Nov 2011 11:54:13 -0700 Subject: x86/PCI: Infrastructure to maintain a list of FW-assigned BIOS BAR values Commit 58c84eda075 introduced functionality to try and reinstate the original BIOS BAR addresses of a PCI device when normal resource assignment attempts fail. 
To keep track of the BIOS BAR addresses, struct pci_dev was augmented with an array to hold the BAR addresses of the PCI device: 'resource_size_t fw_addr[DEVICE_COUNT_RESOURCE]'. The reinstatement of BAR addresses is an uncommon event leaving the 'fw_addr' array unused under normal circumstances. This functionality is also currently architecture specific with an implementation limited to x86. As the use of struct pci_dev is so prevalent, having the 'fw_addr' array residing within such seems somewhat wasteful. This patch introduces a stand alone data structure and interfacing routines for maintaining a list of FW-assigned BIOS BAR value entries. Signed-off-by: Myron Stowe Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 80 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 91821a1a0c3a..5a1edf2b5386 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -39,6 +39,85 @@ #include +/* + * This list of dynamic mappings is for temporarily maintaining + * original BIOS BAR addresses for possible reinstatement. + */ +struct pcibios_fwaddrmap { + struct list_head list; + struct pci_dev *dev; + resource_size_t fw_addr[DEVICE_COUNT_RESOURCE]; +}; + +static LIST_HEAD(pcibios_fwaddrmappings); +static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock); + +/* Must be called with 'pcibios_fwaddrmap_lock' lock held. 
*/ +static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev) +{ + struct pcibios_fwaddrmap *map; + + list_for_each_entry(map, &pcibios_fwaddrmappings, list) + if (map->dev == dev) + return map; + + return NULL; +} + +static void +pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr) +{ + unsigned long flags; + struct pcibios_fwaddrmap *map; + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + map = pcibios_fwaddrmap_lookup(dev); + if (!map) { + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return; + + map->dev = pci_dev_get(dev); + map->fw_addr[idx] = fw_addr; + INIT_LIST_HEAD(&map->list); + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + list_add_tail(&map->list, &pcibios_fwaddrmappings); + } else + map->fw_addr[idx] = fw_addr; + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); +} + +resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx) +{ + unsigned long flags; + struct pcibios_fwaddrmap *map; + resource_size_t fw_addr = 0; + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + map = pcibios_fwaddrmap_lookup(dev); + if (map) + fw_addr = map->fw_addr[idx]; + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); + + return fw_addr; +} + +static void pcibios_fw_addr_list_del(void) +{ + unsigned long flags; + struct pcibios_fwaddrmap *entry, *next; + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + list_for_each_entry_safe(entry, next, &pcibios_fwaddrmappings, list) { + list_del(&entry->list); + pci_dev_put(entry->dev); + kfree(entry); + } + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); +} + static int skip_isa_ioresource_align(struct pci_dev *dev) { diff --git a/include/linux/pci.h b/include/linux/pci.h index a16b1df3deff..8e9a307e58b8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -891,6 +891,7 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void int 
pci_vpd_truncate(struct pci_dev *dev, size_t size); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ +resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); void pci_bus_assign_resources(const struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); -- cgit v1.2.3 From 6535943fbf25c8e9419a6b20ca992633baa0bf99 Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Mon, 21 Nov 2011 11:54:19 -0700 Subject: x86/PCI: Convert maintaining FW-assigned BIOS BAR values to use a list This patch converts the underlying maintenance aspects of FW-assigned BIOS BAR values from a statically allocated array within struct pci_dev to a list of temporary, stand alone, entries. Signed-off-by: Myron Stowe Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 4 +++- drivers/pci/setup-res.c | 24 +++++++++++++++++++++--- include/linux/pci.h | 1 - 3 files changed, 24 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 5a1edf2b5386..33e6a0b995fc 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -261,7 +261,8 @@ static void __init pcibios_allocate_resources(int pass) idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { /* We'll assign a new address later */ - dev->fw_addr[idx] = r->start; + pcibios_save_fw_addr(dev, + idx, r->start); r->end -= r->start; r->start = 0; } @@ -307,6 +308,7 @@ static int __init pcibios_assign_resources(void) } pci_assign_unassigned_resources(); + pcibios_fw_addr_list_del(); return 0; } diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 3cf47d34becf..85c8470c35e2 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -158,16 +158,34 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, return ret; } +/* + * Generic function that returns a value indicating that the device's + * original BIOS BAR address was not saved and so is not 
available for + * reinstatement. + * + * Can be over-ridden by architecture specific code that implements + * reinstatement functionality rather than leaving it disabled when + * normal allocation attempts fail. + */ +resource_size_t __weak pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx) +{ + return 0; +} + static int pci_revert_fw_address(struct resource *res, struct pci_dev *dev, int resno, resource_size_t size) { struct resource *root, *conflict; - resource_size_t start, end; + resource_size_t fw_addr, start, end; int ret = 0; + fw_addr = pcibios_retrieve_fw_addr(dev, resno); + if (!fw_addr) + return 1; + start = res->start; end = res->end; - res->start = dev->fw_addr[resno]; + res->start = fw_addr; res->end = res->start + size - 1; root = pci_find_parent_resource(dev, res); @@ -271,7 +289,7 @@ int pci_assign_resource(struct pci_dev *dev, int resno) * where firmware left it. That at least has a chance of * working, which is better than just leaving it disabled. */ - if (ret < 0 && dev->fw_addr[resno]) + if (ret < 0) ret = pci_revert_fw_address(res, dev, resno, size); if (!ret) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 8e9a307e58b8..4afabb1d2d27 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -299,7 +299,6 @@ struct pci_dev { */ unsigned int irq; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ - resource_size_t fw_addr[DEVICE_COUNT_RESOURCE]; /* FW-assigned addr */ /* These fields are used by common fixups */ unsigned int transparent:1; /* Transparent PCI bridge */ -- cgit v1.2.3 From 316d86fe8641abfad32702c77d9e62cf19e68b00 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Jan 2012 17:41:21 -0700 Subject: x86/PCI: don't fall back to defaults if _CRS has no apertures Host bridges that lead to things like the Uncore need not have any I/O port or MMIO apertures. 
For example, in this case: ACPI: PCI Root Bridge [UNC1] (domain 0000 [bus ff]) PCI: root bus ff: using default resources PCI host bridge to bus 0000:ff pci_bus 0000:ff: root bus resource [io 0x0000-0xffff] pci_bus 0000:ff: root bus resource [mem 0x00000000-0x3fffffffffff] we should not pretend those default resources are available on bus ff. CC: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index a312e76063a7..daa42490c1d9 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -404,7 +404,12 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) kfree(sd); } else { get_current_resources(device, busnum, domain, &resources); - if (list_empty(&resources)) + + /* + * _CRS with no apertures is normal, so only fall back to + * defaults or native bridge info if we're ignoring _CRS. + */ + if (!pci_use_crs) x86_pci_root_bus_resources(busnum, &resources); bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); -- cgit v1.2.3 From 07d620212d51d113fad997357a75f5e1f2ffd5a7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 7 Feb 2012 21:09:03 -0800 Subject: x86: Use generic posix_types.h Change the x86 architecture to use <asm-generic/posix_types.h>. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1328677745-20121-20-git-send-email-hpa@zytor.com Cc: Ingo Molnar Cc: Thomas Gleixner --- arch/x86/include/asm/posix_types_32.h | 75 +++--------------------- arch/x86/include/asm/posix_types_64.h | 106 +--------------------------------- 2 files changed, 12 insertions(+), 169 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h index f7d9adf82e53..99f262e04b91 100644 --- a/arch/x86/include/asm/posix_types_32.h +++ b/arch/x86/include/asm/posix_types_32.h @@ -7,79 +7,22 @@ * assume GCC is being used. */ -typedef unsigned long __kernel_ino_t; typedef unsigned short __kernel_mode_t; +#define __kernel_mode_t __kernel_mode_t + typedef unsigned short __kernel_nlink_t; -typedef long __kernel_off_t; -typedef int __kernel_pid_t; +#define __kernel_nlink_t __kernel_nlink_t + typedef unsigned short __kernel_ipc_pid_t; +#define __kernel_ipc_pid_t __kernel_ipc_pid_t + typedef unsigned short __kernel_uid_t; typedef unsigned short __kernel_gid_t; -typedef unsigned int __kernel_size_t; -typedef int __kernel_ssize_t; -typedef int __kernel_ptrdiff_t; -typedef long __kernel_time_t; -typedef long __kernel_suseconds_t; -typedef long __kernel_clock_t; -typedef int __kernel_timer_t; -typedef int __kernel_clockid_t; -typedef int __kernel_daddr_t; -typedef char * __kernel_caddr_t; -typedef unsigned short __kernel_uid16_t; -typedef unsigned short __kernel_gid16_t; -typedef unsigned int __kernel_uid32_t; -typedef unsigned int __kernel_gid32_t; +#define __kernel_uid_t __kernel_uid_t -typedef unsigned short __kernel_old_uid_t; -typedef unsigned short __kernel_old_gid_t; typedef unsigned short __kernel_old_dev_t; +#define __kernel_old_dev_t __kernel_old_dev_t -#ifdef __GNUC__ -typedef long long __kernel_loff_t; -#endif - -typedef struct { - int val[2]; -} __kernel_fsid_t; - -#if defined(__KERNEL__) - -#undef __FD_SET -#define __FD_SET(fd,fdsetp) \ - asm volatile("btsl %1,%0": \ - 
"+m" (*(__kernel_fd_set *)(fdsetp)) \ - : "r" ((int)(fd))) - -#undef __FD_CLR -#define __FD_CLR(fd,fdsetp) \ - asm volatile("btrl %1,%0": \ - "+m" (*(__kernel_fd_set *)(fdsetp)) \ - : "r" ((int) (fd))) - -#undef __FD_ISSET -#define __FD_ISSET(fd,fdsetp) \ - (__extension__ \ - ({ \ - unsigned char __result; \ - asm volatile("btl %1,%2 ; setb %0" \ - : "=q" (__result) \ - : "r" ((int)(fd)), \ - "m" (*(__kernel_fd_set *)(fdsetp))); \ - __result; \ -})) - -#undef __FD_ZERO -#define __FD_ZERO(fdsetp) \ -do { \ - int __d0, __d1; \ - asm volatile("cld ; rep ; stosl" \ - : "=m" (*(__kernel_fd_set *)(fdsetp)), \ - "=&c" (__d0), "=&D" (__d1) \ - : "a" (0), "1" (__FDSET_LONGS), \ - "2" ((__kernel_fd_set *)(fdsetp)) \ - : "memory"); \ -} while (0) - -#endif /* defined(__KERNEL__) */ +#include <asm-generic/posix_types.h> #endif /* _ASM_X86_POSIX_TYPES_32_H */ diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h index eb8d2d92b63e..cba0c1ead162 100644 --- a/arch/x86/include/asm/posix_types_64.h +++ b/arch/x86/include/asm/posix_types_64.h @@ -7,113 +7,13 @@ * assume GCC is being used. 
*/ -typedef unsigned long __kernel_ino_t; -typedef unsigned int __kernel_mode_t; -typedef unsigned long __kernel_nlink_t; -typedef long __kernel_off_t; -typedef int __kernel_pid_t; -typedef int __kernel_ipc_pid_t; -typedef unsigned int __kernel_uid_t; -typedef unsigned int __kernel_gid_t; -typedef unsigned long __kernel_size_t; -typedef long __kernel_ssize_t; -typedef long __kernel_ptrdiff_t; -typedef long __kernel_time_t; -typedef long __kernel_suseconds_t; -typedef long __kernel_clock_t; -typedef int __kernel_timer_t; -typedef int __kernel_clockid_t; -typedef int __kernel_daddr_t; -typedef char * __kernel_caddr_t; -typedef unsigned short __kernel_uid16_t; -typedef unsigned short __kernel_gid16_t; - -#ifdef __GNUC__ -typedef long long __kernel_loff_t; -#endif - -typedef struct { - int val[2]; -} __kernel_fsid_t; - typedef unsigned short __kernel_old_uid_t; typedef unsigned short __kernel_old_gid_t; -typedef __kernel_uid_t __kernel_uid32_t; -typedef __kernel_gid_t __kernel_gid32_t; +#define __kernel_old_uid_t __kernel_old_uid_t typedef unsigned long __kernel_old_dev_t; +#define __kernel_old_dev_t __kernel_old_dev_t -#ifdef __KERNEL__ - -#undef __FD_SET -static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp) -{ - unsigned long _tmp = fd / __NFDBITS; - unsigned long _rem = fd % __NFDBITS; - fdsetp->fds_bits[_tmp] |= (1UL<<_rem); -} - -#undef __FD_CLR -static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp) -{ - unsigned long _tmp = fd / __NFDBITS; - unsigned long _rem = fd % __NFDBITS; - fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem); -} - -#undef __FD_ISSET -static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p) -{ - unsigned long _tmp = fd / __NFDBITS; - unsigned long _rem = fd % __NFDBITS; - return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0; -} - -/* - * This will unroll the loop for the normal constant cases (8 or 32 longs, - * for 256 and 1024-bit fd_sets respectively) - */ -#undef __FD_ZERO -static inline void 
__FD_ZERO(__kernel_fd_set *p) -{ - unsigned long *tmp = p->fds_bits; - int i; - - if (__builtin_constant_p(__FDSET_LONGS)) { - switch (__FDSET_LONGS) { - case 32: - tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; - tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0; - tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0; - tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0; - tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0; - tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0; - tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0; - tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0; - return; - case 16: - tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; - tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0; - tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0; - tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0; - return; - case 8: - tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; - tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0; - return; - case 4: - tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0; - return; - } - } - i = __FDSET_LONGS; - while (i) { - i--; - *tmp = 0; - tmp++; - } -} - -#endif /* defined(__KERNEL__) */ +#include #endif /* _ASM_X86_POSIX_TYPES_64_H */ -- cgit v1.2.3 From c38e23456278e967f094b08247ffc3711b1029b2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 15 Feb 2012 08:05:18 -0800 Subject: i387: fix sense of sanity check The check for save_init_fpu() (introduced in commit 5b1cbac37798: "i387: make irq_fpu_usable() tests more robust") was the wrong way around, but I hadn't noticed, because my "tests" were bogus: the FPU exceptions are disabled by default, so even doing a divide by zero never actually triggers this code at all unless you do extra work to enable them. So if anybody did enable them, they'd get one spurious warning. 
Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a29571821b99..727c1dd84899 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -404,7 +404,7 @@ static inline void irq_ts_restore(int TS_state) */ static inline void save_init_fpu(struct task_struct *tsk) { - WARN_ON_ONCE(task_thread_info(tsk)->status & TS_USEDFPU); + WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU)); preempt_disable(); __save_init_fpu(tsk); stts(); -- cgit v1.2.3 From 15d8791cae75dca27bfda8ecfe87dca9379d6bb0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 09:15:04 -0800 Subject: i387: fix x86-64 preemption-unsafe user stack save/restore Commit 5b1cbac37798 ("i387: make irq_fpu_usable() tests more robust") added a sanity check to the #NM handler to verify that we never cause the "Device Not Available" exception in kernel mode. However, that check actually pinpointed a (fundamental) race where we do cause that exception as part of the signal stack FPU state save/restore code. Because we use the floating point instructions themselves to save and restore state directly from user mode, we cannot do that atomically with testing the TS_USEDFPU bit: the user mode access itself may cause a page fault, which causes a task switch, which saves and restores the FP/MMX state from the kernel buffers. This kind of "recursive" FP state save is fine per se, but it means that when the signal stack save/restore gets restarted, it will now take the '#NM' exception we originally tried to avoid. With preemption this can happen even without the page fault - but because of the user access, we cannot just disable preemption around the save/restore instruction. 
There are various ways to solve this, including using the "enable/disable_page_fault()" helpers to not allow page faults at all during the sequence, and fall back to copying things by hand without the use of the native FP state save/restore instructions. However, the simplest thing to do is to just allow the #NM from kernel space, but fix the race in setting and clearing CR0.TS that this all exposed: the TS bit changes and the TS_USEDFPU bit absolutely have to be atomic wrt scheduling, so while the actual state save/restore can be interrupted and restarted, the act of actually clearing/setting CR0.TS and the TS_USEDFPU bit together must not. Instead of just adding random "preempt_disable/enable()" calls to what is already excessively ugly code, this introduces some helper functions that mostly mirror the "kernel_fpu_begin/end()" functionality, just for the user state instead. Those helper functions should probably eventually replace the other ad-hoc CR0.TS and TS_USEDFPU tests too, but I'll need to think about it some more: the task switching functionality in particular needs to expose the difference between the 'prev' and 'next' threads, while the new helper functions intentionally were written to only work with 'current'. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 42 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 1 - arch/x86/kernel/xsave.c | 10 +++------- 3 files changed, 45 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 727c1dd84899..f704be239883 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -399,6 +399,48 @@ static inline void irq_ts_restore(int TS_state) stts(); } +/* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. 
+ * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + * + * The actual user_fpu_begin/end() functions + * need to be preemption-safe, though. + * + * NOTE! user_fpu_end() must be used only after you + * have saved the FP state, and user_fpu_begin() must + * be used only immediately before restoring it. + * These functions do not do any save/restore on + * their own. + */ +static inline int user_has_fpu(void) +{ + return current_thread_info()->status & TS_USEDFPU; +} + +static inline void user_fpu_end(void) +{ + preempt_disable(); + current_thread_info()->status &= ~TS_USEDFPU; + stts(); + preempt_enable(); +} + +static inline void user_fpu_begin(void) +{ + preempt_disable(); + if (!user_has_fpu()) { + clts(); + current_thread_info()->status |= TS_USEDFPU; + } + preempt_enable(); +} + /* * These disable preemption on their own and are safe */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8ba27dbc107a..982433b5da30 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -631,7 +631,6 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { - WARN_ON_ONCE(!user_mode_vm(regs)); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a3911343976b..86f1f09a738a 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf) if (!used_math()) return 0; - if (task_thread_info(tsk)->status & TS_USEDFPU) { + if (user_has_fpu()) { if (use_xsave()) err = xsave_user(buf); else @@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf) if (err) return err; - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + user_fpu_end(); } else { 
sanitize_i387_state(tsk); if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, @@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf) return err; } - if (!(task_thread_info(current)->status & TS_USEDFPU)) { - clts(); - task_thread_info(current)->status |= TS_USEDFPU; - } + user_fpu_begin(); if (use_xsave()) err = restore_user_xstate(buf); else -- cgit v1.2.3 From b6c66418dcad0fcf83cd1d0a39482db37bf4fc41 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 12:22:48 -0800 Subject: i387: move TS_USEDFPU clearing out of __save_init_fpu and into callers Touching TS_USEDFPU without touching CR0.TS is confusing, so don't do it. By moving it into the callers, we always do the TS_USEDFPU next to the CR0.TS accesses in the source code, and it's much easier to see how the two go hand in hand. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index f704be239883..1e12c2d087e4 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -259,7 +259,6 @@ static inline void fpu_save_init(struct fpu *fpu) static inline void __save_init_fpu(struct task_struct *tsk) { fpu_save_init(&tsk->thread.fpu); - task_thread_info(tsk)->status &= ~TS_USEDFPU; } static inline int fpu_fxrstor_checking(struct fpu *fpu) @@ -290,6 +289,7 @@ static inline void __unlazy_fpu(struct task_struct *tsk) { if (task_thread_info(tsk)->status & TS_USEDFPU) { __save_init_fpu(tsk); + task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else tsk->fpu_counter = 0; @@ -356,9 +356,11 @@ static inline void kernel_fpu_begin(void) WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); - if (me->status & TS_USEDFPU) + if (me->status & TS_USEDFPU) { __save_init_fpu(me->task); - else + me->status &= ~TS_USEDFPU; + /* We do 'stts()' in kernel_fpu_end() */ + } else clts(); } @@ -449,6 +451,7 @@ static inline void 
save_init_fpu(struct task_struct *tsk) WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU)); preempt_disable(); __save_init_fpu(tsk); + task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); preempt_enable(); } -- cgit v1.2.3 From 6d59d7a9f5b723a7ac1925c136e93ec83c0c3043 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 13:33:12 -0800 Subject: i387: don't ever touch TS_USEDFPU directly, use helper functions This creates three helper functions that do the TS_USEDFPU accesses, and makes everybody that used to do it by hand use those helpers instead. In addition, there's a couple of helper functions for the "change both CR0.TS and TS_USEDFPU at the same time" case, and the places that do that together have been changed to use those. That means that we have fewer random places that open-code this situation. The intent is partly to clarify the code without actually changing any semantics yet (since we clearly still have some hard to reproduce bug in this area), but also to make it much easier to use another approach entirely to caching the CR0.TS bit for software accesses. Right now we use a bit in the thread-info 'status' variable (this patch does not change that), but we might want to make it a full field of its own or even make it a per-cpu variable. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 75 +++++++++++++++++++++++++++++++++------------ arch/x86/kernel/traps.c | 2 +- arch/x86/kernel/xsave.c | 2 +- arch/x86/kvm/vmx.c | 2 +- 4 files changed, 58 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 1e12c2d087e4..548b2c07ac9a 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -279,6 +279,47 @@ static inline int restore_fpu_checking(struct task_struct *tsk) return fpu_restore_checking(&tsk->thread.fpu); } +/* + * Software FPU state helpers. 
Careful: these need to + * be preemption protection *and* they need to be + * properly paired with the CR0.TS changes! + */ +static inline int __thread_has_fpu(struct thread_info *ti) +{ + return ti->status & TS_USEDFPU; +} + +/* Must be paired with an 'stts' after! */ +static inline void __thread_clear_has_fpu(struct thread_info *ti) +{ + ti->status &= ~TS_USEDFPU; +} + +/* Must be paired with a 'clts' before! */ +static inline void __thread_set_has_fpu(struct thread_info *ti) +{ + ti->status |= TS_USEDFPU; +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void __thread_fpu_end(struct thread_info *ti) +{ + __thread_clear_has_fpu(ti); + stts(); +} + +static inline void __thread_fpu_begin(struct thread_info *ti) +{ + clts(); + __thread_set_has_fpu(ti); +} + /* * Signal frame handlers... */ @@ -287,23 +328,21 @@ extern int restore_i387_xstate(void __user *buf); static inline void __unlazy_fpu(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_USEDFPU) { + if (__thread_has_fpu(task_thread_info(tsk))) { __save_init_fpu(tsk); - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(task_thread_info(tsk)); } else tsk->fpu_counter = 0; } static inline void __clear_fpu(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_USEDFPU) { + if (__thread_has_fpu(task_thread_info(tsk))) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(task_thread_info(tsk)); } } @@ -311,14 +350,14 @@ static inline void __clear_fpu(struct task_struct *tsk) * Were we in an interrupt that interrupted kernel mode? 
* * We can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: TS_USEDFPU must be clear (so + * pair does nothing at all: the thread must not have fpu (so * that we don't try to save the FPU state), and TS must * be set (so that the clts/stts pair does nothing that is * visible in the interrupted kernel thread). */ static inline bool interrupted_kernel_fpu_idle(void) { - return !(current_thread_info()->status & TS_USEDFPU) && + return !__thread_has_fpu(current_thread_info()) && (read_cr0() & X86_CR0_TS); } @@ -356,9 +395,9 @@ static inline void kernel_fpu_begin(void) WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); - if (me->status & TS_USEDFPU) { + if (__thread_has_fpu(me)) { __save_init_fpu(me->task); - me->status &= ~TS_USEDFPU; + __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ } else clts(); @@ -422,24 +461,21 @@ static inline void irq_ts_restore(int TS_state) */ static inline int user_has_fpu(void) { - return current_thread_info()->status & TS_USEDFPU; + return __thread_has_fpu(current_thread_info()); } static inline void user_fpu_end(void) { preempt_disable(); - current_thread_info()->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(current_thread_info()); preempt_enable(); } static inline void user_fpu_begin(void) { preempt_disable(); - if (!user_has_fpu()) { - clts(); - current_thread_info()->status |= TS_USEDFPU; - } + if (!user_has_fpu()) + __thread_fpu_begin(current_thread_info()); preempt_enable(); } @@ -448,11 +484,10 @@ static inline void user_fpu_begin(void) */ static inline void save_init_fpu(struct task_struct *tsk) { - WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU)); + WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk))); preempt_disable(); __save_init_fpu(tsk); - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(task_thread_info(tsk)); preempt_enable(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 982433b5da30..fc676e44c77f 100644 
--- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -588,7 +588,7 @@ void __math_state_restore(void) return; } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + __thread_set_has_fpu(thread); /* clts in caller! */ tsk->fpu_counter++; } diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 86f1f09a738a..a0bcd0dbc951 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!fx) return; - BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + BUG_ON(__thread_has_fpu(task_thread_info(tsk))); xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d29216c462b3..36091dd04b4b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (current_thread_info()->status & TS_USEDFPU) + if (__thread_has_fpu(current_thread_info())) clts(); load_gdt(&__get_cpu_var(host_gdt)); } -- cgit v1.2.3 From b3b0870ef3ffed72b92415423da864f440f57ad6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 15:45:23 -0800 Subject: i387: do not preload FPU state at task switch time Yes, taking the trap to re-load the FPU/MMX state is expensive, but so is spending several days looking for a bug in the state save/restore code. And the preload code has some rather subtle interactions with both paravirtualization support and segment state restore, so it's not nearly as simple as it should be. Also, now that we no longer necessarily depend on a single bit (ie TS_USEDFPU) for keeping track of the state of the FPU, we might be able to do better.
If we are really switching between two processes that keep touching the FP state, save/restore is inevitable, but in the case of having one process that does most of the FPU usage, we may actually be able to do much better than the preloading. In particular, we may be able to keep track of which CPU the process ran on last, and also per CPU keep track of which process' FP state that CPU has. For modern CPU's that don't destroy the FPU contents on save time, that would allow us to do a lazy restore by just re-enabling the existing FPU state - with no restore cost at all! Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 1 - arch/x86/kernel/process_32.c | 20 -------------------- arch/x86/kernel/process_64.c | 23 ----------------------- arch/x86/kernel/traps.c | 35 +++++++++++------------------------ 4 files changed, 11 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 548b2c07ac9a..86974c72d0d0 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -30,7 +30,6 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); -extern void __math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 485204f58cda..324cd722b447 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -299,23 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); - bool preload_fpu; /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ - /* - * If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - */ - preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - __unlazy_fpu(prev_p); - /* we're going to use this soon, after a few expensive things */ - if (preload_fpu) - prefetch(next->fpu.state); - /* * Reload esp0. */ @@ -354,11 +342,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); - /* If we're going to preload the fpu context, make sure clts - is run while we're batching the cpu state updates. */ - if (preload_fpu) - clts(); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -368,9 +351,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - if (preload_fpu) - __math_state_restore(); - /* * Restore %gs if needed (which is common) */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c87..992b4e542bc3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -386,18 +386,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; - bool preload_fpu; - - /* - * If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - */ - preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - - /* we're going to use this soon, after a few expensive things */ - if (preload_fpu) - prefetch(next->fpu.state); /* * Reload esp0, LDT and the page table pointer: @@ -430,10 +418,6 @@ __switch_to(struct 
task_struct *prev_p, struct task_struct *next_p) /* Must be after DS reload */ __unlazy_fpu(prev_p); - /* Make sure cpu is ready for new context */ - if (preload_fpu) - clts(); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -492,13 +476,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* - * Preload the FPU context, now that we've determined that the - * task is likely to be using it. - */ - if (preload_fpu) - __math_state_restore(); - return prev_p; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fc676e44c77f..5afe824c66e5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,28 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -/* - * __math_state_restore assumes that cr0.TS is already clear and the - * fpu state is all ready for use. Used during context switch. - */ -void __math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - - __thread_set_has_fpu(thread); /* clts in caller! */ - tsk->fpu_counter++; -} - /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -622,9 +600,18 @@ void math_state_restore(void) local_irq_disable(); } - clts(); /* Allow maths ops (or we recurse) */ + __thread_fpu_begin(thread); - __math_state_restore(); + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. 
+ */ + if (unlikely(restore_fpu_checking(tsk))) { + __thread_fpu_end(thread); + force_sig(SIGSEGV, tsk); + return; + } + + tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); -- cgit v1.2.3 From 4903062b5485f0e2c286a23b44c9b59d9b017d53 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 19:11:15 -0800 Subject: i387: move AMD K7/K8 fpu fxsave/fxrstor workaround from save to restore The AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. In order to not leak FIP state from one process to another, we need to do a floating point load after the fxsave of the old process, and before the fxrstor of the new FPU state. That resets the state to the (uninteresting) kernel load, rather than some potentially sensitive user information. We used to do this directly after the FPU state save, but that is actually very inconvenient, since it (a) corrupts what is potentially perfectly good FPU state that we might want to lazy avoid restoring later and (b) on x86-64 it resulted in a very annoying ordering constraint, where "__unlazy_fpu()" in the task switch needs to be delayed until after the DS segment has been reloaded just to get the new DS value. Coupling it to the fxrstor instead of the fxsave automatically avoids both of these issues, and also ensures that we only do it when actually necessary (the FP state after a save may never actually get used). It's simply a much more natural place for the leaked state cleanup. 
Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 19 ------------------- arch/x86/kernel/process_64.c | 5 ++--- arch/x86/kernel/traps.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 86974c72d0d0..01b115d86770 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -211,15 +211,6 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif /* CONFIG_X86_64 */ -/* We need a safe address that is cheap to find and that is already - in L1 during context switch. The best choices are unfortunately - different for UP and SMP */ -#ifdef CONFIG_SMP -#define safe_address (__per_cpu_offset[0]) -#else -#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) -#endif - /* * These must be called with preempt disabled */ @@ -243,16 +234,6 @@ static inline void fpu_save_init(struct fpu *fpu) if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) asm volatile("fnclex"); - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. 
safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); } static inline void __save_init_fpu(struct task_struct *tsk) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 992b4e542bc3..753e803f7197 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -387,6 +387,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + __unlazy_fpu(prev_p); + /* * Reload esp0, LDT and the page table pointer: */ @@ -415,9 +417,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); - /* Must be after DS reload */ - __unlazy_fpu(prev_p); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5afe824c66e5..4d42300dcd2c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -585,6 +585,10 @@ void math_state_restore(void) struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; + /* We need a safe address that is cheap to find and that is already + in L1. We just brought in "thread->task", so use that */ +#define safe_address (thread->task) + if (!tsk_used_math(tsk)) { local_irq_enable(); /* @@ -602,6 +606,16 @@ void math_state_restore(void) __thread_fpu_begin(thread); + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. 
safe_address is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (safe_address)); + /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ -- cgit v1.2.3 From f94edacf998516ac9d849f7bc6949a703977a7f3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 17 Feb 2012 21:48:54 -0800 Subject: i387: move TS_USEDFPU flag from thread_info to task_struct This moves the bit that indicates whether a thread has ownership of the FPU from the TS_USEDFPU bit in thread_info->status to a word of its own (called 'has_fpu') in task_struct->thread.has_fpu. This fixes two independent bugs at the same time: - changing 'thread_info->status' from the scheduler causes nasty problems for the other users of that variable, since it is defined to be thread-synchronous (that's what the "TS_" part of the naming was supposed to indicate). So perfectly valid code could (and did) do ti->status |= TS_RESTORE_SIGMASK; and the compiler was free to do that as separate load, or and store instructions. Which can cause problems with preemption, since a task switch could happen in between, and change the TS_USEDFPU bit. The change to TS_USEDFPU would be overwritten by the final store. In practice, this seldom happened, though, because the 'status' field was seldom used more than once, so gcc would generally tend to generate code that used a read-modify-write instruction and thus happened to avoid this problem - RMW instructions are naturally low fat and preemption-safe. - On x86-32, the current_thread_info() pointer would, during interrupts and softirqs, point to a *copy* of the real thread_info, because x86-32 uses %esp to calculate the thread_info address, and thus the separate irq (and softirq) stacks would cause these kinds of odd thread_info copy aliases. 
This is normally not a problem, since interrupts aren't supposed to look at thread information anyway (what thread is running at interrupt time really isn't very well-defined), but it confused the heck out of irq_fpu_usable() and the code that tried to squirrel away the FPU state. (It also caused untold confusion for us poor kernel developers). It also turns out that using 'task_struct' is actually much more natural for most of the call sites that care about the FPU state, since they tend to work with the task struct for other reasons anyway (ie scheduling). And the FPU data that we are going to save/restore is found there too. Thanks to Arjan Van De Ven for pointing us to the %esp issue. Cc: Arjan van de Ven Reported-and-tested-by: Raphael Prevost Acked-and-tested-by: Suresh Siddha Tested-by: Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 44 +++++++++++++++++++------------------- arch/x86/include/asm/processor.h | 1 + arch/x86/include/asm/thread_info.h | 2 -- arch/x86/kernel/traps.c | 11 +++++----- arch/x86/kernel/xsave.c | 2 +- arch/x86/kvm/vmx.c | 2 +- 6 files changed, 30 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 01b115d86770..f5376676f89c 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -264,21 +264,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) * be preemption protection *and* they need to be * properly paired with the CR0.TS changes! */ -static inline int __thread_has_fpu(struct thread_info *ti) +static inline int __thread_has_fpu(struct task_struct *tsk) { - return ti->status & TS_USEDFPU; + return tsk->thread.has_fpu; } /* Must be paired with an 'stts' after! 
*/ -static inline void __thread_clear_has_fpu(struct thread_info *ti) +static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - ti->status &= ~TS_USEDFPU; + tsk->thread.has_fpu = 0; } /* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct thread_info *ti) +static inline void __thread_set_has_fpu(struct task_struct *tsk) { - ti->status |= TS_USEDFPU; + tsk->thread.has_fpu = 1; } /* @@ -288,16 +288,16 @@ static inline void __thread_set_has_fpu(struct thread_info *ti) * These generally need preemption protection to work, * do try to avoid using these on their own. */ -static inline void __thread_fpu_end(struct thread_info *ti) +static inline void __thread_fpu_end(struct task_struct *tsk) { - __thread_clear_has_fpu(ti); + __thread_clear_has_fpu(tsk); stts(); } -static inline void __thread_fpu_begin(struct thread_info *ti) +static inline void __thread_fpu_begin(struct task_struct *tsk) { clts(); - __thread_set_has_fpu(ti); + __thread_set_has_fpu(tsk); } /* @@ -308,21 +308,21 @@ extern int restore_i387_xstate(void __user *buf); static inline void __unlazy_fpu(struct task_struct *tsk) { - if (__thread_has_fpu(task_thread_info(tsk))) { + if (__thread_has_fpu(tsk)) { __save_init_fpu(tsk); - __thread_fpu_end(task_thread_info(tsk)); + __thread_fpu_end(tsk); } else tsk->fpu_counter = 0; } static inline void __clear_fpu(struct task_struct *tsk) { - if (__thread_has_fpu(task_thread_info(tsk))) { + if (__thread_has_fpu(tsk)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(task_thread_info(tsk)); + __thread_fpu_end(tsk); } } @@ -337,7 +337,7 @@ static inline void __clear_fpu(struct task_struct *tsk) */ static inline bool interrupted_kernel_fpu_idle(void) { - return !__thread_has_fpu(current_thread_info()) && + return !__thread_has_fpu(current) && (read_cr0() & X86_CR0_TS); } @@ -371,12 +371,12 @@ static inline bool irq_fpu_usable(void) static inline void 
kernel_fpu_begin(void) { - struct thread_info *me = current_thread_info(); + struct task_struct *me = current; WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); if (__thread_has_fpu(me)) { - __save_init_fpu(me->task); + __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ } else @@ -441,13 +441,13 @@ static inline void irq_ts_restore(int TS_state) */ static inline int user_has_fpu(void) { - return __thread_has_fpu(current_thread_info()); + return __thread_has_fpu(current); } static inline void user_fpu_end(void) { preempt_disable(); - __thread_fpu_end(current_thread_info()); + __thread_fpu_end(current); preempt_enable(); } @@ -455,7 +455,7 @@ static inline void user_fpu_begin(void) { preempt_disable(); if (!user_has_fpu()) - __thread_fpu_begin(current_thread_info()); + __thread_fpu_begin(current); preempt_enable(); } @@ -464,10 +464,10 @@ static inline void user_fpu_begin(void) */ static inline void save_init_fpu(struct task_struct *tsk) { - WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk))); + WARN_ON_ONCE(!__thread_has_fpu(tsk)); preempt_disable(); __save_init_fpu(tsk); - __thread_fpu_end(task_thread_info(tsk)); + __thread_fpu_end(tsk); preempt_enable(); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c26931..f7c89e231c6c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -454,6 +454,7 @@ struct thread_struct { unsigned long trap_no; unsigned long error_code; /* floating point and extended processor state */ + unsigned long has_fpu; struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index bc817cd8b443..cfd8144d5527 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -247,8 +247,6 @@ static inline struct thread_info *current_thread_info(void) * ever touches our thread-synchronous status, so we don't * 
have to worry about atomic accesses. */ -#define TS_USEDFPU 0x0001 /* FPU was used by this task - this quantum (SMP) */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #define TS_POLLING 0x0004 /* idle task polling need_resched, skip sending interrupt */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4d42300dcd2c..ad25e51f40c4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -582,12 +582,11 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) */ void math_state_restore(void) { - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; /* We need a safe address that is cheap to find and that is already - in L1. We just brought in "thread->task", so use that */ -#define safe_address (thread->task) + in L1. We're just bringing in "tsk->thread.has_fpu", so use that */ +#define safe_address (tsk->thread.has_fpu) if (!tsk_used_math(tsk)) { local_irq_enable(); @@ -604,7 +603,7 @@ void math_state_restore(void) local_irq_disable(); } - __thread_fpu_begin(thread); + __thread_fpu_begin(tsk); /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed @@ -620,7 +619,7 @@ void math_state_restore(void) * Paranoid restore. send a SIGSEGV if we fail to restore the state. 
*/ if (unlikely(restore_fpu_checking(tsk))) { - __thread_fpu_end(thread); + __thread_fpu_end(tsk); force_sig(SIGSEGV, tsk); return; } diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a0bcd0dbc951..711091114119 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!fx) return; - BUG_ON(__thread_has_fpu(task_thread_info(tsk))); + BUG_ON(__thread_has_fpu(tsk)); xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 36091dd04b4b..3b4c8d8ad906 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (__thread_has_fpu(current_thread_info())) + if (__thread_has_fpu(current)) clts(); load_gdt(&__get_cpu_var(host_gdt)); } -- cgit v1.2.3 From 34ddc81a230b15c0e345b6b253049db731499f7e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 18 Feb 2012 12:56:35 -0800 Subject: i387: re-introduce FPU state preloading at context switch time After all the FPU state cleanups and finally finding the problem that caused all our FPU save/restore problems, this re-introduces the preloading of FPU state that was removed in commit b3b0870ef3ff ("i387: do not preload FPU state at task switch time"). However, instead of simply reverting the removal, this reimplements preloading with several fixes, most notably - properly abstracted as a true FPU state switch, rather than as open-coded save and restore with various hacks. In particular, implementing it as a proper FPU state switch allows us to optimize the CR0.TS flag accesses: there is no reason to set the TS bit only to then almost immediately clear it again. CR0 accesses are quite slow and expensive, don't flip the bit back and forth for no good reason. 
- Make sure that the same model works for both x86-32 and x86-64, so that there are no gratuitous differences between the two due to the way they save and restore segment state differently due to architectural differences that really don't matter to the FPU state. - Avoid exposing the "preload" state to the context switch routines, and in particular allow the concept of lazy state restore: if nothing else has used the FPU in the meantime, and the process is still on the same CPU, we can avoid restoring state from memory entirely, just re-expose the state that is still in the FPU unit. That optimized lazy restore isn't actually implemented here, but the infrastructure is set up for it. Of course, older CPU's that use 'fnsave' to save the state cannot take advantage of this, since the state saving also trashes the state. In other words, there is now an actual _design_ to the FPU state saving, rather than just random historical baggage. Hopefully it's easier to follow as a result. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 110 ++++++++++++++++++++++++++++++++++++------- arch/x86/kernel/process_32.c | 5 +- arch/x86/kernel/process_64.c | 5 +- arch/x86/kernel/traps.c | 55 +++++++++++++--------- 4 files changed, 133 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index f5376676f89c..a850b4d8d14d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,6 +29,7 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); +extern void __math_state_restore(struct task_struct *); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); @@ -212,9 +213,10 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif /* CONFIG_X86_64 */ /* - * These must be called with preempt disabled + * These must be called with 
preempt disabled. Returns + * 'true' if the FPU state is still intact. */ -static inline void fpu_save_init(struct fpu *fpu) +static inline int fpu_save_init(struct fpu *fpu) { if (use_xsave()) { fpu_xsave(fpu); @@ -223,22 +225,33 @@ static inline void fpu_save_init(struct fpu *fpu) * xsave header may indicate the init state of the FP. */ if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return; + return 1; } else if (use_fxsr()) { fpu_fxsave(fpu); } else { asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state->fsave)); - return; + return 0; } - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? + */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { asm volatile("fnclex"); + return 0; + } + return 1; } -static inline void __save_init_fpu(struct task_struct *tsk) +static inline int __save_init_fpu(struct task_struct *tsk) { - fpu_save_init(&tsk->thread.fpu); + return fpu_save_init(&tsk->thread.fpu); } static inline int fpu_fxrstor_checking(struct fpu *fpu) @@ -301,20 +314,79 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) } /* - * Signal frame handlers... + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); +typedef struct { int preload; } fpu_switch_t; + +/* + * FIXME! 
We could do a totally lazy restore, but we need to + * add a per-cpu "this was the task that last touched the FPU + * on this CPU" variable, and the task needs to have a "I last + * touched the FPU on this CPU" and check them. + * + * We don't do that yet, so "fpu_lazy_restore()" always returns + * false, but some day.. + */ +#define fpu_lazy_restore(tsk) (0) +#define fpu_lazy_state_intact(tsk) do { } while (0) + +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (__save_init_fpu(old)) + fpu_lazy_state_intact(old); + __thread_clear_has_fpu(old); + old->fpu_counter++; + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else + stts(); + } else { + old->fpu_counter = 0; + if (fpu.preload) { + if (fpu_lazy_restore(new)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + __thread_fpu_begin(new); + } + } + return fpu; +} -static inline void __unlazy_fpu(struct task_struct *tsk) +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { - if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->fpu_counter = 0; + if (fpu.preload) + __math_state_restore(new); } +/* + * Signal frame handlers... 
+ */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + static inline void __clear_fpu(struct task_struct *tsk) { if (__thread_has_fpu(tsk)) { @@ -474,7 +546,11 @@ static inline void save_init_fpu(struct task_struct *tsk) static inline void unlazy_fpu(struct task_struct *tsk) { preempt_disable(); - __unlazy_fpu(tsk); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; preempt_enable(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 324cd722b447..80bfe1ab0031 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -299,10 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + fpu_switch_t fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + fpu = switch_fpu_prepare(prev_p, next_p); /* * Reload esp0. 
@@ -357,6 +358,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); + switch_fpu_finish(next_p, fpu); + percpu_write(current_task, next_p); return prev_p; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 753e803f7197..1fd94bc4279d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -386,8 +386,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + fpu_switch_t fpu; - __unlazy_fpu(prev_p); + fpu = switch_fpu_prepare(prev_p, next_p); /* * Reload esp0, LDT and the page table pointer: @@ -457,6 +458,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; + switch_fpu_finish(next_p, fpu); + /* * Switch the PDA and FPU contexts. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ad25e51f40c4..77da5b475ad2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,6 +570,37 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } +/* + * This gets called with the process already owning the + * FPU state, and with CR0.TS cleared. It just needs to + * restore the FPU register state. + */ +void __math_state_restore(struct task_struct *tsk) +{ + /* We need a safe address that is cheap to find and that is already + in L1. We've just brought in "tsk->thread.has_fpu", so use that */ +#define safe_address (tsk->thread.has_fpu) + + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. 
safe_address is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (safe_address)); + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + __thread_fpu_end(tsk); + force_sig(SIGSEGV, tsk); + return; + } +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -584,10 +615,6 @@ void math_state_restore(void) { struct task_struct *tsk = current; - /* We need a safe address that is cheap to find and that is already - in L1. We're just bringing in "tsk->thread.has_fpu", so use that */ -#define safe_address (tsk->thread.has_fpu) - if (!tsk_used_math(tsk)) { local_irq_enable(); /* @@ -604,25 +631,7 @@ void math_state_restore(void) } __thread_fpu_begin(tsk); - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - __thread_fpu_end(tsk); - force_sig(SIGSEGV, tsk); - return; - } + __math_state_restore(tsk); tsk->fpu_counter++; } -- cgit v1.2.3 From b0deca2e0270135f797e81bdb0743e50fd1dc58d Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 17 Feb 2012 08:16:41 -0600 Subject: x86/UV: Lower UV rtc clocksource rating Lower the rating of the UV rtc clocksource to just below that of the tsc, to improve performance. 
Reading the tsc clocksource has lower latency than reading the rtc, so favor it in situations where it is synchronized and stable. When the tsc is unsynchronized, the rtc needs to be the chosen clocksource. Signed-off-by: Dimitri Sivanich Cc: John Stultz Cc: Jack Steiner Link: http://lkml.kernel.org/r/20120217141641.GA28063@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_time.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 9f29a01ee1b3..5032e0d19b86 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -37,7 +37,7 @@ static void uv_rtc_timer_setup(enum clock_event_mode, static struct clocksource clocksource_uv = { .name = RTC_NAME, - .rating = 400, + .rating = 299, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, @@ -379,10 +379,6 @@ static __init int uv_rtc_setup_clock(void) if (!is_uv_system()) return -ENODEV; - /* If single blade, prefer tsc */ - if (uv_num_possible_blades() == 1) - clocksource_uv.rating = 250; - rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); -- cgit v1.2.3 From 45d5a1683c04be28abdf5c04c27b1417e0374486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 19 Feb 2012 16:43:37 -0500 Subject: x86/nmi: Test saved %cs in NMI to determine nested NMI case Currently, the NMI handler tests if it is nested by checking the special variable saved on the stack (set during NMI handling) and whether the saved stack is the NMI stack as well (to prevent the race when the variable is set to zero). But userspace may set their %rsp to any value as long as they do not dereference it, and it may make it point to the NMI stack, which will prevent NMIs from triggering while the userspace app is running.
(I tested this, and it is indeed the case) Add another check to determine nested NMIs by looking at the saved %cs (code segment register) and making sure that it is the kernel code segment. Signed-off-by: Steven Rostedt Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1329687817.1561.27.camel@acer.local.home Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..debd851de6ff 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1531,6 +1531,13 @@ ENTRY(nmi) /* Use %rdx as out temp variable throughout */ pushq_cfi %rdx + /* + * If %cs was not the kernel segment, then the NMI triggered in user + * space, which means it is definitely not nested. + */ + cmp $__KERNEL_CS, 16(%rsp) + jne first_nmi + /* * Check the special variable on the stack to see if NMIs are * executing. -- cgit v1.2.3 From 986cb48c5a4de0085db94d343b4e7dcf54355ec1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 11:46:36 -0800 Subject: x86-32/irq: Don't switch to irq stack for a user-mode irq If the irq happens in user mode, our kernel stack is empty (apart from the pt_regs themselves, of course), so there's no need or advantage to switch. And it really doesn't save any stack space, quite the reverse: it means that a nested interrupt cannot switch irq stacks. So instead of saving kernel stack space, it actually causes the potential for *more* stack usage. Also simplify the preemption count copy when we do switch stacks: just copy the whole preemption count, rather than just the softirq parts of it. There is no advantage to the partial copy: it is more effort to get a less correct result. 
Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202191139260.10000@i5.linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 40fc86161d92..58b7f27cb3e9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -100,13 +100,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) irqctx->tinfo.task = curctx->tinfo.task; irqctx->tinfo.previous_esp = current_stack_pointer; - /* - * Copy the softirq bits in preempt_count so that the - * softirq checks work in the hardirq context. - */ - irqctx->tinfo.preempt_count = - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | - (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + /* Copy the preempt_count so that the [soft]irq checks work. */ + irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -196,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) if (unlikely(!desc)) return false; - if (!execute_on_irq_stack(overflow, desc, irq)) { + if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); desc->handle_irq(irq, desc); -- cgit v1.2.3 From 416d7214741daba3acd6d328289858390bef37bc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 10 Feb 2012 09:24:08 -0500 Subject: xen/setup: Remove redundant filtering of PTE masks. commit 7347b4082e55ac4a673f06a0a0ce25c37273c9ec "xen: Allow unprivileged Xen domains to create iomap pages" added a redundant line in the early bootup code to filter out the PTE. That filtering is already done a bit earlier so this extra processing is not required. 
Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 12eb07bfb267..7c44e1bf981e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1204,10 +1204,6 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; - if (!xen_initial_domain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - - __supported_pte_mask |= _PAGE_IOMAP; /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; -- cgit v1.2.3 From 8eaffa67b43e99ae581622c5133e20b0f48bcef1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 10 Feb 2012 09:16:27 -0500 Subject: xen/pat: Disable PAT support for now. [Pls also look at https://lkml.org/lkml/2012/2/10/228] Using PAT to change pages from WB to WC works quite nicely. Changing it back to WB - not so much. The crux of the matter is that the code that does this (__change_page_attr_set_clr) has only limited information so when it tries to make the change it gets the "raw" unfiltered information instead of the properly filtered one - and the "raw" one tells it that the PSE bit is on (while in fact it is not). As a result when the PTE is set to be WB from WC, we get tons of: :WARNING: at arch/x86/xen/mmu.c:475 xen_make_pte+0x67/0xa0() :Hardware name: HP xw4400 Workstation .. snip.. :Pid: 27, comm: kswapd0 Tainted: G W 3.2.2-1.fc16.x86_64 #1 :Call Trace: : [] warn_slowpath_common+0x7f/0xc0 : [] warn_slowpath_null+0x1a/0x20 : [] xen_make_pte+0x67/0xa0 : [] __raw_callee_save_xen_make_pte+0x11/0x1e : [] ? __change_page_attr_set_clr+0x9d5/0xc00 : [] ? __purge_vmap_area_lazy+0x158/0x1d0 : [] ? vm_unmap_aliases+0x175/0x190 : [] change_page_attr_set_clr+0x128/0x4c0 : [] set_pages_array_wb+0x42/0xa0 : [] ?
check_events+0x12/0x20 : [] ttm_pages_put+0x1c/0x70 [ttm] : [] ttm_page_pool_free+0xf8/0x180 [ttm] : [] ttm_pool_mm_shrink+0x58/0x90 [ttm] : [] shrink_slab+0x154/0x310 : [] balance_pgdat+0x4fa/0x6c0 : [] kswapd+0x178/0x3d0 : [] ? __schedule+0x3d4/0x8c0 : [] ? remove_wait_queue+0x50/0x50 : [] ? balance_pgdat+0x6c0/0x6c0 : [] kthread+0x8c/0xa0 for every page. The proper fix for this has been posted and is https://lkml.org/lkml/2012/2/10/228 "x86/cpa: Use pte_attrs instead of pte_flags on CPA/set_p.._wb/wc operations." along with a detailed description of the problem and solution. But since that posting has gone nowhere I am proposing this band-aid solution so that at least users don't get the page corruption (the pages that are WC don't get changed to WB and end up being recycled for filesystem or other things causing mysterious crashes). The negative impact of this patch is that users of WC flag (which are InfiniBand, radeon, nouveau drivers) won't be able to set that flag - so they are going to see performance degradation. But stability is more important here. Fixes RH BZ# 742032, 787403, and 745574 Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/mmu.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7c44e1bf981e..4172af8ceeb3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1141,7 +1141,9 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs.
*/ __supported_pte_mask &= ~_PAGE_GLOBAL; +#if 0 if (!xen_initial_domain()) +#endif __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); __supported_pte_mask |= _PAGE_IOMAP; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 58a0e46c404d..95c1cf60c669 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -415,13 +415,13 @@ static pteval_t iomap_pte(pteval_t val) static pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; - +#if 0 /* If this is a WC pte, convert back from Xen WC to Linux WC */ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { WARN_ON(!pat_enabled); pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; } - +#endif if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) return pteval; @@ -463,7 +463,7 @@ void xen_set_pat(u64 pat) static pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); - +#if 0 /* If Linux is trying to set a WC pte, then map to the Xen WC. * If _PAGE_PAT is set, then it probably means it is really * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope @@ -476,7 +476,7 @@ static pte_t xen_make_pte(pteval_t pte) if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; } - +#endif /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA -- cgit v1.2.3 From cea20ca3f3181fc36788a15bc65d1062b96a0a6c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Feb 2012 10:24:09 -0800 Subject: i387: fix up some fpu_counter confusion This makes sure we clear the FPU usage counter for newly created tasks, just so that we start off in a known state (for example, don't try to preload the FPU state on the first task switch etc). It also fixes a thinko in when we increment the fpu_counter at task switch time, introduced by commit 34ddc81a230b ("i387: re-introduce FPU state preloading at context switch time"). 
We should increment the *new* task fpu_counter, not the old task, and only if we decide to use that state (whether lazily or preloaded). Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 3 ++- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a850b4d8d14d..8df95849721d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -348,10 +348,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta if (__save_init_fpu(old)) fpu_lazy_state_intact(old); __thread_clear_has_fpu(old); - old->fpu_counter++; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { + new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); } else @@ -359,6 +359,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta } else { old->fpu_counter = 0; if (fpu.preload) { + new->fpu_counter++; if (fpu_lazy_restore(new)) fpu.preload = 0; else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 80bfe1ab0031..bc32761bc27a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -214,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1fd94bc4279d..8ad880b3bc1c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -286,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, set_tsk_thread_flag(p, TIF_FORK); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); -- cgit v1.2.3 From 80ab6f1e8c981b1b6604b2f22e36c917526235cd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: 
Sun, 19 Feb 2012 11:48:44 -0800 Subject: i387: use 'restore_fpu_checking()' directly in task switching code This inlines what is usually just a couple of instructions, but more importantly it also fixes the theoretical error case (can that FPU restore really ever fail? Maybe we should remove the checking). We can't start sending signals from within the scheduler, we're much too deep in the kernel and are holding the runqueue lock etc. So don't bother even trying. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 17 ++++++++++++++--- arch/x86/kernel/traps.c | 40 ++++++++-------------------------------- 2 files changed, 22 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 8df95849721d..74c607b37e87 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,7 +29,6 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern void __math_state_restore(struct task_struct *); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); @@ -269,6 +268,16 @@ static inline int fpu_restore_checking(struct fpu *fpu) static inline int restore_fpu_checking(struct task_struct *tsk) { + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. 
"m" is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (tsk->thread.has_fpu)); + return fpu_restore_checking(&tsk->thread.fpu); } @@ -378,8 +387,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta */ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { - if (fpu.preload) - __math_state_restore(new); + if (fpu.preload) { + if (unlikely(restore_fpu_checking(new))) + __thread_fpu_end(new); + } } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 77da5b475ad2..4bbe04d96744 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,37 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -/* - * This gets called with the process already owning the - * FPU state, and with CR0.TS cleared. It just needs to - * restore the FPU register state. - */ -void __math_state_restore(struct task_struct *tsk) -{ - /* We need a safe address that is cheap to find and that is already - in L1. We've just brought in "tsk->thread.has_fpu", so use that */ -#define safe_address (tsk->thread.has_fpu) - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. 
- */ - if (unlikely(restore_fpu_checking(tsk))) { - __thread_fpu_end(tsk); - force_sig(SIGSEGV, tsk); - return; - } -} - /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -631,7 +600,14 @@ void math_state_restore(void) } __thread_fpu_begin(tsk); - __math_state_restore(tsk); + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + __thread_fpu_end(tsk); + force_sig(SIGSEGV, tsk); + return; + } tsk->fpu_counter++; } -- cgit v1.2.3 From 7e16838d94b566a17b65231073d179bc04d590c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 13:27:00 -0800 Subject: i387: support lazy restore of FPU state This makes us recognize when we try to restore FPU state that matches what we already have in the FPU on this CPU, and avoids the restore entirely if so. To do this, we add two new data fields: - a percpu 'fpu_owner_task' variable that gets written any time we update the "has_fpu" field, and thus acts as a kind of back-pointer to the task that owns the CPU. The exception is when we save the FPU state as part of a context switch - if the save can keep the FPU state around, we leave the 'fpu_owner_task' variable pointing at the task whose FP state still remains on the CPU. - a per-thread 'last_cpu' field, that indicates which CPU that thread used its FPU on last. We update this on every context switch (writing an invalid CPU number if the last context switch didn't leave the FPU in a lazily usable state), so we know that *that* thread has done nothing else with the FPU since. 
These two fields together can be used when next switching back to the task to see if the CPU still matches: if 'fpu_owner_task' matches the task we are switching to, we know that no other task (or kernel FPU usage) touched the FPU on this CPU in the meantime, and if the current CPU number matches the 'last_cpu' field, we know that this thread did no other FP work on any other CPU, so the FPU state on the CPU must match what was saved on last context switch. In that case, we can avoid the 'f[x]rstor' entirely, and just clear the CR0.TS bit. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 35 +++++++++++++++++++++++------------ arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 74c607b37e87..247904945d3f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "emms\n\t" /* clear stack tags */ "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.has_fpu)); + [addr] "m" (tsk->thread.fpu.has_fpu)); return fpu_restore_checking(&tsk->thread.fpu); } @@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) */ static inline int __thread_has_fpu(struct task_struct *tsk) { - return tsk->thread.has_fpu; + return tsk->thread.fpu.has_fpu; } /* Must be paired with an 'stts' after! 
*/ static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 0; + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 1; + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); } /* @@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t; * We don't do that yet, so "fpu_lazy_restore()" always returns * false, but some day.. */ -#define fpu_lazy_restore(tsk) (0) -#define fpu_lazy_state_intact(tsk) do { } while (0) +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; if (__thread_has_fpu(old)) { - if (__save_init_fpu(old)) - fpu_lazy_state_intact(old); - __thread_clear_has_fpu(old); + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ /* Don't change CR0.TS if we just switch! 
*/ if (fpu.preload) { @@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (fpu_lazy_restore(new)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void) __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ - } else + } else { + percpu_write(fpu_owner_task, NULL); clts(); + } } static inline void kernel_fpu_end(void) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..58545c97d071 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,6 +374,8 @@ union thread_xstate { }; struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; union thread_xstate *state; }; @@ -454,7 +456,6 @@ struct thread_struct { unsigned long trap_no; unsigned long error_code; /* floating point and extended processor state */ - unsigned long has_fpu; struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..b667148dfad7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); + /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bc32761bc27a..c08d1ff12b7c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8ad880b3bc1c..cfa5c90c01db 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unsigned fsindex, gsindex; fpu_switch_t fpu; - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0, LDT and the page table pointer: -- cgit v1.2.3 From 6bd330083e0e97b7ddc053459190bf3d5768ca83 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 6 Feb 2012 13:03:09 -0800 Subject: x86: Factor out TIF_IA32 from 32-bit address space Factor out IA32 (compatibility instruction set) from 32-bit address space in the thread_info flags; this is a precondition patch for x32 support. Originally-by: H. J. Lu Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-4pr1xnnksprt7t0h3w5fw4rv@git.kernel.org --- arch/x86/include/asm/elf.h | 4 ++-- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/include/asm/thread_info.h | 4 +++- arch/x86/kernel/process_64.c | 2 ++ arch/x86/kernel/sys_x86_64.c | 6 +++--- arch/x86/oprofile/backtrace.c | 2 +- 6 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5f962df30d0f..410fa6a219f6 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -287,7 +287,7 @@ do { \ #define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */ /* 1GB for 64bit, 8MB for 32bit */ -#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff) +#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 
0x7ff : 0x3fffff) #define ARCH_DLINFO \ do { \ @@ -330,7 +330,7 @@ static inline int mmap_is_ia32(void) return 1; #endif #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) + if (test_thread_flag(TIF_ADDR32)) return 1; #endif return 0; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c26931..9f748b5fb701 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -924,9 +924,9 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ +#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define STACK_TOP TASK_SIZE diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index bc817cd8b443..d1803a495b35 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -86,7 +86,7 @@ struct thread_info { #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_IA32 17 /* 32bit process */ +#define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_FORK 18 /* ret_from_fork */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_DEBUG 21 /* uses debug registers */ @@ -95,6 +95,7 @@ struct thread_info { #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define _TIF_SYSCALL_TRACE (1 << 
TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -116,6 +117,7 @@ struct thread_info { #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_ADDR32 (1 << TIF_ADDR32) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c87..0e900d09e232 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -508,6 +508,7 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_ADDR32); /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -526,6 +527,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); + set_thread_flag(TIF_ADDR32); current->personality |= force_personality32; /* Mark the associated mm as containing 32-bit tasks. */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 051489082d59..f921df8c2099 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -98,7 +98,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { + if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { unsigned long new_begin; /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. 
Limit @@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) + if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) && len <= mm->cached_hole_size) { mm->cached_hole_size = 0; mm->free_area_cache = begin; @@ -205,7 +205,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legact mmap base */ - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) + if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index bff89dfe3619..d6aa6e8315d1 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -67,7 +67,7 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) { struct stack_frame_ia32 *head; - /* User process is 32-bit */ + /* User process is IA32 */ if (!current || !test_thread_flag(TIF_IA32)) return 0; -- cgit v1.2.3 From 4f72e331c20ac1c656f300cee246330c1786652b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 6 Feb 2012 13:50:20 -0800 Subject: x86-64: Use explicit sizes in sigcontext.h, prepare for x32 Use explicit sizes (__u64) instead of implicit sizes (unsigned long) in the definition for sigcontext.h; this will allow this structure to be shared between the x86-64 native ABI and the x32 ABI. Originally-by: H. J. Lu Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/n/tip-4pr1xnnksprt7t0h3w5fw4rv@git.kernel.org --- arch/x86/include/asm/sigcontext.h | 57 ++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 04459d25e66e..4a085383af27 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -230,34 +230,37 @@ struct sigcontext { * User-space might still rely on the old definition: */ struct sigcontext { - unsigned long r8; - unsigned long r9; - unsigned long r10; - unsigned long r11; - unsigned long r12; - unsigned long r13; - unsigned long r14; - unsigned long r15; - unsigned long rdi; - unsigned long rsi; - unsigned long rbp; - unsigned long rbx; - unsigned long rdx; - unsigned long rax; - unsigned long rcx; - unsigned long rsp; - unsigned long rip; - unsigned long eflags; /* RFLAGS */ - unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; - unsigned long err; - unsigned long trapno; - unsigned long oldmask; - unsigned long cr2; + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 rdi; + __u64 rsi; + __u64 rbp; + __u64 rbx; + __u64 rdx; + __u64 rax; + __u64 rcx; + __u64 rsp; + __u64 rip; + __u64 eflags; /* RFLAGS */ + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; struct _fpstate __user *fpstate; /* zero when no FPU context */ - unsigned long reserved1[8]; +#ifndef __LP64__ + __u32 __fpstate_pad; +#endif + __u64 reserved1[8]; }; #endif /* !__KERNEL__ */ -- cgit v1.2.3 From 1f5e27a90add2fe2a1c11508f68d377e3ddcf9ab Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Tue, 14 Feb 2012 13:13:21 -0800 Subject: x32: Create posix_types_x32.h This is the same as the 64-bit posix_types.h, except that __kernel_[u]long_t is defined to be [unsigned] long long and therefore 64 bits. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/posix_types.h | 4 +++- arch/x86/include/asm/posix_types_x32.h | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/posix_types_x32.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index b57e6a43a37a..986954fb9513 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -14,6 +14,7 @@ header-y += msr.h header-y += mtrr.h header-y += posix_types_32.h header-y += posix_types_64.h +header-y += posix_types_x32.h header-y += prctl.h header-y += processor-flags.h header-y += ptrace-abi.h diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h index bb7133dc155d..3427b7798dbc 100644 --- a/arch/x86/include/asm/posix_types.h +++ b/arch/x86/include/asm/posix_types.h @@ -7,7 +7,9 @@ #else # ifdef __i386__ # include "posix_types_32.h" -# else +# elif defined(__LP64__) # include "posix_types_64.h" +# else +# include "posix_types_x32.h" # endif #endif diff --git a/arch/x86/include/asm/posix_types_x32.h b/arch/x86/include/asm/posix_types_x32.h new file mode 100644 index 000000000000..85f9bdafa93c --- /dev/null +++ b/arch/x86/include/asm/posix_types_x32.h @@ -0,0 +1,19 @@ +#ifndef _ASM_X86_POSIX_TYPES_X32_H +#define _ASM_X86_POSIX_TYPES_X32_H + +/* + * This file is only used by user-level software, so you need to + * be a little careful about namespace pollution etc. Also, we cannot + * assume GCC is being used. 
+ * + * These types should generally match the ones used by the 64-bit kernel, + * + */ + +typedef long long __kernel_long_t; +typedef unsigned long long __kernel_ulong_t; +#define __kernel_long_t __kernel_long_t + +#include + +#endif /* _ASM_X86_POSIX_TYPES_X32_H */ -- cgit v1.2.3 From d046ff8b30319d9aa38d877a0ba4206771e54346 Mon Sep 17 00:00:00 2001 From: "H. J. Lu" Date: Tue, 14 Feb 2012 13:49:48 -0800 Subject: x86-64: Add prototype for old_rsp to a header file So far this has only been used in process_64.c, but the x32 code will need it in additional code. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9f748b5fb701..e34f95129f16 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -948,6 +948,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); + +/* + * User space RSP while inside the SYSCALL fast path + */ +DECLARE_PER_CPU(unsigned long, old_rsp); + #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, -- cgit v1.2.3 From bb2127240c5595ae4ef7115494f51e973692f64e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Feb 2012 13:56:49 -0800 Subject: x32: Add a thread flag for x32 processes An x32 process is *almost* the same thing as a 64-bit process with a 32-bit address limit, but there are a few minor differences -- in particular core dumps are 32 bits and signal handling is different. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/thread_info.h | 2 ++ arch/x86/kernel/process_64.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d1803a495b35..912e93511466 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -96,6 +96,7 @@ struct thread_info { #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ +#define TIF_X32 30 /* 32-bit native x86-64 binary */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -118,6 +119,7 @@ struct thread_info { #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) +#define _TIF_X32 (1 << TIF_X32) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0e900d09e232..5fe2fbaa56ba 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -509,6 +509,7 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); + clear_thread_flag(TIF_X32); /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -528,6 +529,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); set_thread_flag(TIF_ADDR32); + clear_thread_flag(TIF_X32); current->personality |= force_personality32; /* Mark the associated mm as containing 32-bit tasks. */ -- cgit v1.2.3 From 2c73ce734653f96542a070f3c3b3e3d1cd0fba02 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Sun, 19 Feb 2012 09:48:01 -0800 Subject: x86-64, ia32: Drop sys32_rt_sigprocmask On x86, the only difference between sys_rt_sigprocmask and sys32_rt_sigprocmask is the alignment of the data structures. However, x86 allows data accesses with arbitrary alignment, and therefore there is no reason for this code to be different. Reported-by: Gregory M. Lueck Signed-off-by: H. Peter Anvin --- arch/x86/ia32/sys_ia32.c | 40 ---------------------------------------- arch/x86/include/asm/sys_ia32.h | 2 -- arch/x86/syscalls/syscall_32.tbl | 2 +- 3 files changed, 1 insertion(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index f6f5c53dc903..aec2202a596c 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -287,46 +287,6 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, return ret; } -asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, - compat_sigset_t __user *oset, - unsigned int sigsetsize) -{ - sigset_t s; - compat_sigset_t s32; - int ret; - mm_segment_t old_fs = get_fs(); - - if (set) { - if (copy_from_user(&s32, set, sizeof(compat_sigset_t))) - return -EFAULT; - switch (_NSIG_WORDS) { - case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); - case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32); - case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32); - case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); - } - } - set_fs(KERNEL_DS); - ret = sys_rt_sigprocmask(how, - set ? (sigset_t __user *)&s : NULL, - oset ? 
(sigset_t __user *)&s : NULL, - sigsetsize); - set_fs(old_fs); - if (ret) - return ret; - if (oset) { - switch (_NSIG_WORDS) { - case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; - case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; - case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; - case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; - } - if (copy_to_user(oset, &s32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return 0; -} - asmlinkage long sys32_alarm(unsigned int seconds) { return alarm_setitimer(seconds); diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index cb238526a9f1..68da87bfb5a3 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -36,8 +36,6 @@ asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, struct sigaction32 __user *, unsigned int); asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *, struct old_sigaction32 __user *); -asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *, - compat_sigset_t __user *, unsigned int); asmlinkage long sys32_alarm(unsigned int); asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index ce98e287c066..031cef84fe43 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -181,7 +181,7 @@ 172 i386 prctl sys_prctl 173 i386 rt_sigreturn ptregs_rt_sigreturn stub32_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction sys32_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask sys32_rt_sigprocmask +175 i386 rt_sigprocmask sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending sys32_rt_sigpending 177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait 178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo sys32_rt_sigqueueinfo -- cgit v1.2.3 From 6630f11ba54414b9870d87dfef2bee467bfa842a Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Tue, 14 Feb 2012 14:18:50 -0800 Subject: x32: Add x32 system calls to syscall/syscall_64.tbl Split the 64-bit system calls into "64" (64-bit only) and "common" (64-bit or x32) and add the x32 system call numbers. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets_64.c | 2 + arch/x86/kernel/syscall_64.c | 3 + arch/x86/syscalls/Makefile | 2 +- arch/x86/syscalls/syscall_64.tbl | 579 +++++++++++++++++++++------------------ arch/x86/um/sys_call_table_64.c | 3 + arch/x86/um/user-offsets.c | 2 + 6 files changed, 317 insertions(+), 274 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 834e897b1e25..c3354f7b0a06 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,6 +1,8 @@ #include #define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, +#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ static char syscalls_64[] = { #include }; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 7ac7943be02c..26c4ca1f20e8 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,6 +5,9 @@ #include #include +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ + #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; #include #undef __SYSCALL_64 diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 564b2476fede..89dd9581c5a2 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -24,7 +24,7 @@ syshdr_pfx_unistd_32_ia32 := ia32_ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) $(call if_changed,syshdr) -syshdr_abi_unistd_64 := 64 +syshdr_abi_unistd_64 := common,64 $(out)/unistd_64.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 
b440a8f7eefa..4aecc7e31166 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -4,317 +4,350 @@ # The format is: # <number> <abi> <name> <entry point> # -# The abi is always "64" for this file (for now.) +# The abi is "common", "64" or "x32" for this file. # -0 64 read sys_read -1 64 write sys_write -2 64 open sys_open -3 64 close sys_close -4 64 stat sys_newstat -5 64 fstat sys_newfstat -6 64 lstat sys_newlstat -7 64 poll sys_poll -8 64 lseek sys_lseek -9 64 mmap sys_mmap -10 64 mprotect sys_mprotect -11 64 munmap sys_munmap -12 64 brk sys_brk +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction -14 64 rt_sigprocmask sys_rt_sigprocmask +14 common rt_sigprocmask sys_rt_sigprocmask 15 64 rt_sigreturn stub_rt_sigreturn 16 64 ioctl sys_ioctl -17 64 pread64 sys_pread64 -18 64 pwrite64 sys_pwrite64 +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 19 64 readv sys_readv 20 64 writev sys_writev -21 64 access sys_access -22 64 pipe sys_pipe -23 64 select sys_select -24 64 sched_yield sys_sched_yield -25 64 mremap sys_mremap -26 64 msync sys_msync -27 64 mincore sys_mincore -28 64 madvise sys_madvise -29 64 shmget sys_shmget -30 64 shmat sys_shmat -31 64 shmctl sys_shmctl -32 64 dup sys_dup -33 64 dup2 sys_dup2 -34 64 pause sys_pause -35 64 nanosleep sys_nanosleep -36 64 getitimer sys_getitimer -37 64 alarm sys_alarm -38 64 setitimer sys_setitimer -39 64 getpid sys_getpid -40 64 sendfile sys_sendfile64 -41 64 socket sys_socket -42 64 connect sys_connect -43 64 accept sys_accept -44 64 sendto sys_sendto +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap
sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto 45 64 recvfrom sys_recvfrom 46 64 sendmsg sys_sendmsg 47 64 recvmsg sys_recvmsg -48 64 shutdown sys_shutdown -49 64 bind sys_bind -50 64 listen sys_listen -51 64 getsockname sys_getsockname -52 64 getpeername sys_getpeername -53 64 socketpair sys_socketpair -54 64 setsockopt sys_setsockopt -55 64 getsockopt sys_getsockopt -56 64 clone stub_clone -57 64 fork stub_fork -58 64 vfork stub_vfork +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 common setsockopt sys_setsockopt +55 common getsockopt sys_getsockopt +56 common clone stub_clone +57 common fork stub_fork +58 common vfork stub_vfork 59 64 execve stub_execve -60 64 exit sys_exit -61 64 wait4 sys_wait4 -62 64 kill sys_kill -63 64 uname sys_newuname -64 64 semget sys_semget -65 64 semop sys_semop -66 64 semctl sys_semctl -67 64 shmdt sys_shmdt -68 64 msgget sys_msgget -69 64 msgsnd sys_msgsnd -70 64 msgrcv sys_msgrcv -71 64 msgctl sys_msgctl -72 64 fcntl sys_fcntl -73 64 flock sys_flock -74 64 fsync sys_fsync -75 64 fdatasync sys_fdatasync -76 64 truncate sys_truncate -77 64 ftruncate sys_ftruncate -78 64 getdents sys_getdents -79 64 getcwd sys_getcwd -80 64 chdir sys_chdir -81 64 fchdir sys_fchdir -82 64 rename sys_rename -83 64 mkdir sys_mkdir -84 64 rmdir sys_rmdir -85 64 creat sys_creat -86 
64 link sys_link -87 64 unlink sys_unlink -88 64 symlink sys_symlink -89 64 readlink sys_readlink -90 64 chmod sys_chmod -91 64 fchmod sys_fchmod -92 64 chown sys_chown -93 64 fchown sys_fchown -94 64 lchown sys_lchown -95 64 umask sys_umask -96 64 gettimeofday sys_gettimeofday -97 64 getrlimit sys_getrlimit -98 64 getrusage sys_getrusage -99 64 sysinfo sys_sysinfo +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo 100 64 times sys_times -101 64 ptrace sys_ptrace -102 64 getuid sys_getuid -103 64 syslog sys_syslog -104 64 getgid sys_getgid -105 64 setuid sys_setuid -106 64 setgid sys_setgid -107 64 geteuid sys_geteuid -108 64 getegid sys_getegid -109 64 setpgid sys_setpgid -110 64 getppid sys_getppid -111 64 getpgrp sys_getpgrp -112 64 setsid sys_setsid -113 64 setreuid sys_setreuid -114 64 setregid sys_setregid -115 64 getgroups sys_getgroups -116 64 setgroups 
sys_setgroups -117 64 setresuid sys_setresuid -118 64 getresuid sys_getresuid -119 64 setresgid sys_setresgid -120 64 getresgid sys_getresgid -121 64 getpgid sys_getpgid -122 64 setfsuid sys_setfsuid -123 64 setfsgid sys_setfsgid -124 64 getsid sys_getsid -125 64 capget sys_capget -126 64 capset sys_capset +101 common ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 common getresuid sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset 127 64 rt_sigpending sys_rt_sigpending 128 64 rt_sigtimedwait sys_rt_sigtimedwait 129 64 rt_sigqueueinfo sys_rt_sigqueueinfo -130 64 rt_sigsuspend sys_rt_sigsuspend +130 common rt_sigsuspend sys_rt_sigsuspend 131 64 sigaltstack stub_sigaltstack -132 64 utime sys_utime -133 64 mknod sys_mknod +132 common utime sys_utime +133 common mknod sys_mknod 134 64 uselib -135 64 personality sys_personality -136 64 ustat sys_ustat -137 64 statfs sys_statfs -138 64 fstatfs sys_fstatfs -139 64 sysfs sys_sysfs -140 64 getpriority sys_getpriority -141 64 setpriority sys_setpriority -142 64 sched_setparam sys_sched_setparam -143 64 sched_getparam sys_sched_getparam -144 64 sched_setscheduler sys_sched_setscheduler -145 64 sched_getscheduler sys_sched_getscheduler -146 64 sched_get_priority_max sys_sched_get_priority_max -147 64 sched_get_priority_min 
sys_sched_get_priority_min -148 64 sched_rr_get_interval sys_sched_rr_get_interval -149 64 mlock sys_mlock -150 64 munlock sys_munlock -151 64 mlockall sys_mlockall -152 64 munlockall sys_munlockall -153 64 vhangup sys_vhangup -154 64 modify_ldt sys_modify_ldt -155 64 pivot_root sys_pivot_root +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root 156 64 _sysctl sys_sysctl -157 64 prctl sys_prctl -158 64 arch_prctl sys_arch_prctl -159 64 adjtimex sys_adjtimex -160 64 setrlimit sys_setrlimit -161 64 chroot sys_chroot -162 64 sync sys_sync -163 64 acct sys_acct -164 64 settimeofday sys_settimeofday -165 64 mount sys_mount -166 64 umount2 sys_umount -167 64 swapon sys_swapon -168 64 swapoff sys_swapoff -169 64 reboot sys_reboot -170 64 sethostname sys_sethostname -171 64 setdomainname sys_setdomainname -172 64 iopl stub_iopl -173 64 ioperm sys_ioperm +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common 
swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl stub_iopl +173 common ioperm sys_ioperm 174 64 create_module -175 64 init_module sys_init_module -176 64 delete_module sys_delete_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module 177 64 get_kernel_syms 178 64 query_module -179 64 quotactl sys_quotactl +179 common quotactl sys_quotactl 180 64 nfsservctl -181 64 getpmsg -182 64 putpmsg -183 64 afs_syscall -184 64 tuxcall -185 64 security -186 64 gettid sys_gettid -187 64 readahead sys_readahead -188 64 setxattr sys_setxattr -189 64 lsetxattr sys_lsetxattr -190 64 fsetxattr sys_fsetxattr -191 64 getxattr sys_getxattr -192 64 lgetxattr sys_lgetxattr -193 64 fgetxattr sys_fgetxattr -194 64 listxattr sys_listxattr -195 64 llistxattr sys_llistxattr -196 64 flistxattr sys_flistxattr -197 64 removexattr sys_removexattr -198 64 lremovexattr sys_lremovexattr -199 64 fremovexattr sys_fremovexattr -200 64 tkill sys_tkill -201 64 time sys_time -202 64 futex sys_futex -203 64 sched_setaffinity sys_sched_setaffinity -204 64 sched_getaffinity sys_sched_getaffinity +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 
common sched_getaffinity sys_sched_getaffinity 205 64 set_thread_area -206 64 io_setup sys_io_setup -207 64 io_destroy sys_io_destroy -208 64 io_getevents sys_io_getevents -209 64 io_submit sys_io_submit -210 64 io_cancel sys_io_cancel +206 common io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 common io_submit sys_io_submit +210 common io_cancel sys_io_cancel 211 64 get_thread_area -212 64 lookup_dcookie sys_lookup_dcookie -213 64 epoll_create sys_epoll_create +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create 214 64 epoll_ctl_old 215 64 epoll_wait_old -216 64 remap_file_pages sys_remap_file_pages -217 64 getdents64 sys_getdents64 -218 64 set_tid_address sys_set_tid_address -219 64 restart_syscall sys_restart_syscall -220 64 semtimedop sys_semtimedop -221 64 fadvise64 sys_fadvise64 +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall sys_restart_syscall +220 common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 222 64 timer_create sys_timer_create -223 64 timer_settime sys_timer_settime -224 64 timer_gettime sys_timer_gettime -225 64 timer_getoverrun sys_timer_getoverrun -226 64 timer_delete sys_timer_delete -227 64 clock_settime sys_clock_settime -228 64 clock_gettime sys_clock_gettime -229 64 clock_getres sys_clock_getres -230 64 clock_nanosleep sys_clock_nanosleep -231 64 exit_group sys_exit_group -232 64 epoll_wait sys_epoll_wait -233 64 epoll_ctl sys_epoll_ctl -234 64 tgkill sys_tgkill -235 64 utimes sys_utimes +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common 
clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes 236 64 vserver -237 64 mbind sys_mbind -238 64 set_mempolicy sys_set_mempolicy -239 64 get_mempolicy sys_get_mempolicy -240 64 mq_open sys_mq_open -241 64 mq_unlink sys_mq_unlink -242 64 mq_timedsend sys_mq_timedsend -243 64 mq_timedreceive sys_mq_timedreceive +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive 244 64 mq_notify sys_mq_notify -245 64 mq_getsetattr sys_mq_getsetattr +245 common mq_getsetattr sys_mq_getsetattr 246 64 kexec_load sys_kexec_load 247 64 waitid sys_waitid -248 64 add_key sys_add_key -249 64 request_key sys_request_key -250 64 keyctl sys_keyctl -251 64 ioprio_set sys_ioprio_set -252 64 ioprio_get sys_ioprio_get -253 64 inotify_init sys_inotify_init -254 64 inotify_add_watch sys_inotify_add_watch -255 64 inotify_rm_watch sys_inotify_rm_watch -256 64 migrate_pages sys_migrate_pages -257 64 openat sys_openat -258 64 mkdirat sys_mkdirat -259 64 mknodat sys_mknodat -260 64 fchownat sys_fchownat -261 64 futimesat sys_futimesat -262 64 newfstatat sys_newfstatat -263 64 unlinkat sys_unlinkat -264 64 renameat sys_renameat -265 64 linkat sys_linkat -266 64 symlinkat sys_symlinkat -267 64 readlinkat sys_readlinkat -268 64 fchmodat sys_fchmodat -269 64 faccessat sys_faccessat -270 64 pselect6 sys_pselect6 -271 64 ppoll sys_ppoll -272 64 unshare sys_unshare +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 
common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare 273 64 set_robust_list sys_set_robust_list 274 64 get_robust_list sys_get_robust_list -275 64 splice sys_splice -276 64 tee sys_tee -277 64 sync_file_range sys_sync_file_range +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range 278 64 vmsplice sys_vmsplice 279 64 move_pages sys_move_pages -280 64 utimensat sys_utimensat -281 64 epoll_pwait sys_epoll_pwait -282 64 signalfd sys_signalfd -283 64 timerfd_create sys_timerfd_create -284 64 eventfd sys_eventfd -285 64 fallocate sys_fallocate -286 64 timerfd_settime sys_timerfd_settime -287 64 timerfd_gettime sys_timerfd_gettime -288 64 accept4 sys_accept4 -289 64 signalfd4 sys_signalfd4 -290 64 eventfd2 sys_eventfd2 -291 64 epoll_create1 sys_epoll_create1 -292 64 dup3 sys_dup3 -293 64 pipe2 sys_pipe2 -294 64 inotify_init1 sys_inotify_init1 +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 
common inotify_init1 sys_inotify_init1 295 64 preadv sys_preadv 296 64 pwritev sys_pwritev 297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo -298 64 perf_event_open sys_perf_event_open +298 common perf_event_open sys_perf_event_open 299 64 recvmmsg sys_recvmmsg -300 64 fanotify_init sys_fanotify_init -301 64 fanotify_mark sys_fanotify_mark -302 64 prlimit64 sys_prlimit64 -303 64 name_to_handle_at sys_name_to_handle_at -304 64 open_by_handle_at sys_open_by_handle_at -305 64 clock_adjtime sys_clock_adjtime -306 64 syncfs sys_syncfs +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs 307 64 sendmmsg sys_sendmmsg -308 64 setns sys_setns -309 64 getcpu sys_getcpu +308 common setns sys_setns +309 common getcpu sys_getcpu 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. 
+# +512 x32 rt_sigaction sys32_rt_sigaction +513 x32 rt_sigreturn stub_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve stub_x32_execve +521 x32 times compat_sys_times +522 x32 rt_sigpending sys32_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo sys32_rt_sigqueueinfo +525 x32 sigaltstack stub_x32_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index fe626c3ba01b..9924776f4265 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,6 +35,9 @@ #define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ + #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; #include diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 5edf4f4bbf53..ce7e3607a870 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -15,6 +15,8 @@ static char syscalls[] = { }; #else #define __SYSCALL_64(nr, sym, compat) [nr] = 1, 
+#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, +#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ static char syscalls[] = { #include }; -- cgit v1.2.3 From 6cbb369f578378cf5b1876766d860ae7c2a94d60 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Feb 2012 14:38:31 -0800 Subject: x32: Generate <asm/unistd_x32.h> Generate <asm/unistd_x32.h>; this exports x32 system call numbers to user space. [ v2: Enclose all arguments to syshdr in '' so empty arguments aren't dropped on the floor. ] Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/unistd.h | 7 ++++++- arch/x86/syscalls/Makefile | 13 ++++++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 986954fb9513..f9c0d3ba9e84 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -25,3 +25,4 @@ header-y += vsyscall.h genhdr-y += unistd_32.h genhdr-y += unistd_64.h +genhdr-y += unistd_x32.h diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 21f77b89e47a..dab5349f14fc 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -1,6 +1,9 @@ #ifndef _ASM_X86_UNISTD_H #define _ASM_X86_UNISTD_H 1 +/* x32 syscall flag bit */ +#define __X32_SYSCALL_BIT 0x40000000 + #ifdef __KERNEL__ # ifdef CONFIG_X86_32 @@ -52,8 +55,10 @@ #else # ifdef __i386__ # include -# else +# elif defined(__LP64__) # include +# else +# include # endif #endif diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 89dd9581c5a2..8051c3134ad1 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -10,8 +10,10 @@ syshdr := $(srctree)/$(src)/syscallhdr.sh systbl := $(srctree)/$(src)/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \ - $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget)) + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ + 
'$(syshdr_abi_$(basetarget))' \ + '$(syshdr_pfx_$(basetarget))' \ + '$(syshdr_offset_$(basetarget))' quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ @@ -24,6 +26,11 @@ syshdr_pfx_unistd_32_ia32 := ia32_ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) $(call if_changed,syshdr) +syshdr_abi_unistd_x32 := common,x32 +syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT +$(out)/unistd_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + syshdr_abi_unistd_64 := common,64 $(out)/unistd_64.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) @@ -33,7 +40,7 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl) $(out)/syscalls_64.h: $(syscall64) $(systbl) $(call if_changed,systbl) -syshdr-y += unistd_32.h unistd_64.h +syshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h -- cgit v1.2.3 From ea499fec48dd771bd92984337fcb57ed4c787e69 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Feb 2012 14:46:23 -0800 Subject: x32: Generate <asm/unistd_64_x32.h> Generate macros for the *kernel* code to use to refer to x32 system calls. These have an __NR_x32_ prefix and do not include __X32_SYSCALL_BIT. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/unistd.h | 1 + arch/x86/syscalls/Makefile | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index dab5349f14fc..7a48a5557470 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -17,6 +17,7 @@ # else # include +# include # define __ARCH_WANT_COMPAT_SYS_TIME # endif diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile index 8051c3134ad1..3236aebc828d 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/syscalls/Makefile @@ -35,6 +35,11 @@ syshdr_abi_unistd_64 := common,64 $(out)/unistd_64.h: $(syscall64) $(syshdr) $(call if_changed,syshdr) +syshdr_abi_unistd_64_x32 := x32 +syshdr_pfx_unistd_64_x32 := x32_ +$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + $(out)/syscalls_32.h: $(syscall32) $(systbl) $(call if_changed,systbl) $(out)/syscalls_64.h: $(syscall64) $(systbl) @@ -42,7 +47,7 @@ $(out)/syscalls_64.h: $(syscall64) $(systbl) syshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h -syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h +syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h targets += $(syshdr-y) -- cgit v1.2.3 From f28f0c23576662fb293defe9b1884d5a6e1bd85c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 07:38:43 -0800 Subject: x86: Move some signal-handling definitions to a common header There are some definitions which are duplicated between kernel/signal.c and ia32/ia32_signal.c; move them to a common header file. Rather than adding stuff to existing header files which contain data structures, create a new header file; hence the slightly odd name ("all the good ones were taken.") Note: nothing relied on signal_fault() being defined in <asm/ptrace.h>. Signed-off-by: H. 
Peter Anvin --- arch/x86/ia32/ia32_signal.c | 12 ++---------- arch/x86/include/asm/ptrace.h | 1 - arch/x86/include/asm/sighandling.h | 19 +++++++++++++++++++ arch/x86/kernel/signal.c | 10 +--------- 4 files changed, 22 insertions(+), 20 deletions(-) create mode 100644 arch/x86/include/asm/sighandling.h (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 65577698cab2..25d80f3faf2e 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -12,10 +12,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -31,16 +29,10 @@ #include #include #include +#include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +#define FIX_EFLAGS __FIX_EFLAGS int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 35664547125b..dcfde52979c3 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -145,7 +145,6 @@ extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); -void signal_fault(struct pt_regs *regs, void __user *frame, char *where); extern long syscall_trace_enter(struct pt_regs *); extern void syscall_trace_leave(struct pt_regs *); diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h new file mode 100644 index 000000000000..843e299e120e --- /dev/null +++ b/arch/x86/include/asm/sighandling.h @@ -0,0 +1,19 @@ +#ifndef _ASM_X86_SIGHANDLING_H +#define _ASM_X86_SIGHANDLING_H + +#include +#include +#include + 
+#include + +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ + X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ + X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ + X86_EFLAGS_CF) + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where); + +#endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc27e2..c432dc0e65f0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -10,10 +10,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -26,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -37,13 +36,6 @@ #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - #ifdef CONFIG_X86_32 # define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) #else -- cgit v1.2.3 From 851394229e79c11b0b5b74c509817848e9a80564 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 07:43:09 -0800 Subject: x32: Export setup/restore_sigcontext from signal.c Export setup_sigcontext() and restore_sigcontext() from signal.c, so we can use the 64-bit versions verbatim for x32. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/sighandling.h | 5 +++++ arch/x86/kernel/signal.c | 10 ++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 843e299e120e..ada93b3b8c66 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -16,4 +16,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax); +int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask); + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c432dc0e65f0..450fb255f877 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -60,9 +60,8 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax) { void __user *buf; unsigned int tmpflags; @@ -117,9 +116,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, return err; } -static int -setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) +int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) { int err = 0; -- cgit v1.2.3 From 4048e2a8d4b491a69bf47ceda12cc0c0b924f6b8 Mon Sep 17 00:00:00 2001 From: "H. J. Lu" Date: Sun, 19 Feb 2012 07:46:08 -0800 Subject: x32: Add struct ucontext_x32 Add a definition for struct ucontext_x32; this is inherently a mix of the 32- and 64-bit versions. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/ia32.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 1f7e62517284..c6435ab1cc13 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -43,6 +43,15 @@ struct ucontext_ia32 { compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; +struct ucontext_x32 { + unsigned int uc_flags; + unsigned int uc_link; + stack_ia32_t uc_stack; + unsigned int uc__pad0; /* needed for alignment */ + struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ + compat_sigset_t uc_sigmask; /* mask last for extensibility */ +}; + /* This matches struct stat64 in glibc2.2, hence the absolutely * insane amounts of padding around dev_t's. */ -- cgit v1.2.3 From 9d3897630e14b3d33bcb24a3c0fa9d60a01d3058 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 07:50:12 -0800 Subject: x32: Add rt_sigframe_x32 Add rt_sigframe_x32 to <asm/sigframe.h>. Unfortunately we can't just define all the data structures unconditionally, due to the #ifdef CONFIG_COMPAT in <linux/compat.h> and its trickle-down effects, hence the #ifdef mess. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/sigframe.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 4e0fe26d27d3..7c7c27c97daa 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -59,12 +59,25 @@ struct rt_sigframe_ia32 { #endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */ #ifdef CONFIG_X86_64 + struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; /* fp state follows here */ }; + +#ifdef CONFIG_X86_X32_ABI + +struct rt_sigframe_x32 { + u64 pretcode; + struct ucontext_x32 uc; + compat_siginfo_t info; + /* fp state follows here */ +}; + +#endif /* CONFIG_X86_X32_ABI */ + #endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_SIGFRAME_H */ -- cgit v1.2.3 From fca460f95e928bae373daa8295877b6905bc62b8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 07:56:26 -0800 Subject: x32: Handle the x32 system call flag x32 shares most system calls with x86-64, but unfortunately some subsystems (the input subsystem is the chief offender) require is_compat_task() when operating with a 32-bit userspace. The input system actually has text files in sysfs whose meaning is dependent on sizeof(long) in userspace! We could solve this by having two completely disjoint system call tables; requiring that each system call be duplicated. This patch takes a different approach: we add a flag to the system call number; this flag doesn't affect the system call dispatch but requests compat treatment from affected subsystems for the duration of the system call. The change of cmpq to cmpl is safe since it immediately follows the andl instruction. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/compat.h | 13 +++++++++++-- arch/x86/include/asm/syscall.h | 5 +++-- arch/x86/include/asm/unistd.h | 7 +++++++ arch/x86/kernel/entry_64.S | 10 ++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 30d737ef2a42..7938b84e4506 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -7,6 +7,7 @@ #include #include #include +#include #define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" @@ -212,9 +213,17 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)regs->sp - len; } -static inline int is_compat_task(void) +static inline bool is_compat_task(void) { - return current_thread_info()->status & TS_COMPAT; +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->status & TS_COMPAT) + return true; +#endif +#ifdef CONFIG_X86_X32_ABI + if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) + return true; +#endif + return false; } #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d962e5652a73..386b78686c4d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,6 +16,7 @@ #include #include #include /* For NR_syscalls */ +#include extern const unsigned long sys_call_table[]; @@ -26,13 +27,13 @@ extern const unsigned long sys_call_table[]; */ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - return regs->orig_ax; + return regs->orig_ax & __SYSCALL_MASK; } static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { - regs->ax = regs->orig_ax; + regs->ax = regs->orig_ax & __SYSCALL_MASK; } static inline long syscall_get_error(struct task_struct *task, diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 7a48a5557470..37cdc9d99bb1 100644 --- a/arch/x86/include/asm/unistd.h 
+++ b/arch/x86/include/asm/unistd.h @@ -5,6 +5,13 @@ #define __X32_SYSCALL_BIT 0x40000000 #ifdef __KERNEL__ + +# ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_MASK (~(__X32_SYSCALL_BIT)) +# else +# define __SYSCALL_MASK (~0) +# endif + # ifdef CONFIG_X86_32 # include diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..a17b34216971 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -482,7 +482,12 @@ GLOBAL(system_call_after_swapgs) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys system_call_fastpath: +#if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif ja badsys movq %r10,%rcx call *sys_call_table(,%rax,8) # XXX: rip relative @@ -596,7 +601,12 @@ tracesys: */ LOAD_ARGS ARGOFFSET, 1 RESTORE_REST +#if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) -- cgit v1.2.3 From a96d692e9a559980f269f81c9b0b094220382186 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 14:02:46 -0800 Subject: x86: Add #ifdef CONFIG_COMPAT to <asm/sys_ia32.h> Unfortunately a lot of the compat types are guarded with CONFIG_COMPAT or the equivalent, so add a similar guard to <asm/sys_ia32.h> to avoid compilation failures when CONFIG_COMPAT=n. Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/sys_ia32.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 68da87bfb5a3..3fda9db48819 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -10,6 +10,8 @@ #ifndef _ASM_X86_SYS_IA32_H #define _ASM_X86_SYS_IA32_H +#ifdef CONFIG_COMPAT + #include #include #include @@ -81,4 +83,7 @@ asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int, const char __user *); + +#endif /* CONFIG_COMPAT */ + #endif /* _ASM_X86_SYS_IA32_H */ -- cgit v1.2.3 From c5a373942bbc41698724fc948c74f959f73407e5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 09:41:09 -0800 Subject: x32: Signal-related system calls x32 uses the 64-bit signal frame format, obviously, but there are some structures which mix that with pointers or sizeof(long) types; as such, we have to create a handful of system calls specific to x32. By and large these are a mixture of the 64-bit and the compat system calls. Originally-by: H. J. Lu Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/entry_64.S | 19 ++++++++ arch/x86/kernel/signal.c | 118 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a17b34216971..53dc821f0a62 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -746,6 +746,25 @@ ENTRY(stub_rt_sigreturn) CFI_ENDPROC END(stub_rt_sigreturn) +#ifdef CONFIG_X86_X32_ABI + PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx + +ENTRY(stub_x32_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys32_x32_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_rt_sigreturn) + +#endif + /* * Build the entry stubs and pointer table with some assembler magic. * We pack 7 stubs into a single 32-byte chunk, which will fit in a diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 450fb255f877..c3846b6fb726 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -29,6 +29,7 @@ #ifdef CONFIG_X86_64 #include #include +#include #endif /* CONFIG_X86_64 */ #include @@ -632,6 +633,16 @@ static int signr_convert(int sig) #define is_ia32 0 #endif /* CONFIG_IA32_EMULATION */ +#ifdef CONFIG_X86_X32_ABI +#define is_x32 test_thread_flag(TIF_X32) + +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs); +#else /* !CONFIG_X86_X32_ABI */ +#define is_x32 0 +#endif /* CONFIG_X86_X32_ABI */ + int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs); int ia32_setup_frame(int sig, struct k_sigaction *ka, @@ -656,8 +667,14 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else 
ret = ia32_setup_frame(usig, ka, set, regs); - } else +#ifdef CONFIG_X86_X32_ABI + } else if (is_x32) { + ret = x32_setup_rt_frame(usig, ka, info, + (compat_sigset_t *)set, regs); +#endif + } else { ret = __setup_rt_frame(sig, ka, info, set, regs); + } if (ret) { force_sigsegv(sig, current); @@ -840,3 +857,102 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV, me); } + +#ifdef CONFIG_X86_X32_ABI +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, + siginfo_t *info, compat_sigset_t *set, + struct pt_regs *regs) +{ + struct rt_sigframe_x32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, info)) + return -EFAULT; + } + + put_user_try { + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + put_user_ex(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + put_user_ex(0, &frame->uc.uc__pad0); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = ka->sa.sa_restorer; + } else { + /* could use a vstub here */ + restorer = NULL; + err |= -EFAULT; + } + put_user_ex(restorer, &frame->pretcode); + } put_user_catch(err); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + + /* We use the x32 calling convention here... 
*/ + regs->di = sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; + + return 0; +} + +asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long ax; + struct pt_regs tregs; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + tregs = *regs; + if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif -- cgit v1.2.3 From d1a797f388d6d30fa502915d1b9937ed758b7137 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 10:06:34 -0800 Subject: x32: Handle process creation Allow an x32 process to be started. Originally-by: H. J. Lu Signed-off-by: H. Peter Anvin Cc: Peter Zijlstra --- arch/x86/include/asm/compat.h | 26 ++++++++++++++++++++++++-- arch/x86/include/asm/elf.h | 25 +++++++++++++++++++++---- arch/x86/kernel/cpu/perf_event.c | 4 +++- arch/x86/kernel/entry_64.S | 15 +++++++++++++++ arch/x86/kernel/process_64.c | 23 ++++++++++++++++------- 5 files changed, 79 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 7938b84e4506..e7f68b49c01a 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include #include +#include #include #include @@ -187,7 +188,20 @@ struct compat_shmid64_ds { /* * The type of struct elf_prstatus.pr_reg in compatible core dumps. 
*/ +#ifdef CONFIG_X86_X32_ABI +typedef struct user_regs_struct compat_elf_gregset_t; + +#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216) +#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296) +#define SET_PR_FPVALID(S,V) \ + do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \ + while (0) + +#define COMPAT_USE_64BIT_TIME \ + (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) +#else typedef struct user_regs_struct32 compat_elf_gregset_t; +#endif /* * A pointer passed in from user mode. This should not @@ -209,8 +223,16 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) static inline void __user *arch_compat_alloc_user_space(long len) { - struct pt_regs *regs = task_pt_regs(current); - return (void __user *)regs->sp - len; + compat_uptr_t sp; + + if (test_thread_flag(TIF_IA32)) { + sp = task_pt_regs(current)->sp; + } else { + /* -128 for the x32 ABI redzone */ + sp = percpu_read(old_rsp) - 128; + } + + return (void __user *)round_down(sp - len, 16); } static inline bool is_compat_task(void) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 410fa6a219f6..83aabea95dd7 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -156,7 +156,12 @@ do { \ #define elf_check_arch(x) \ ((x)->e_machine == EM_X86_64) -#define compat_elf_check_arch(x) elf_check_arch_ia32(x) +#define compat_elf_check_arch(x) \ + (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64) + +#if __USER32_DS != __USER_DS +# error "The following code assumes __USER32_DS == __USER_DS" +#endif static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) @@ -179,8 +184,9 @@ static inline void elf_common_init(struct thread_struct *t, void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); #define compat_start_thread start_thread_ia32 -void set_personality_ia32(void); -#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() +void set_personality_ia32(bool); 
+#define COMPAT_SET_PERSONALITY(ex) \ + set_personality_ia32((ex).e_machine == EM_X86_64) #define COMPAT_ELF_PLATFORM ("i686") @@ -296,9 +302,20 @@ do { \ (unsigned long)current->mm->context.vdso); \ } while (0) +#define ARCH_DLINFO_X32 \ +do { \ + if (vdso_enabled) \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso); \ +} while (0) + #define AT_SYSINFO 32 -#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32) +#define COMPAT_ARCH_DLINFO \ +if (test_thread_flag(TIF_X32)) \ + ARCH_DLINFO_X32; \ +else \ + ARCH_DLINFO_IA32(sysctl_vsyscall32) #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..63c0e058a405 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -1595,6 +1594,9 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) } #ifdef CONFIG_COMPAT + +#include + static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 53dc821f0a62..9e036f0ce5e0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -763,6 +763,21 @@ ENTRY(stub_x32_rt_sigreturn) CFI_ENDPROC END(stub_x32_rt_sigreturn) +ENTRY(stub_x32_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx + call sys32_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execve) + #endif /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5fe2fbaa56ba..a0701da2bd18 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -364,7 +364,9 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) void 
start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) { start_thread_common(regs, new_ip, new_sp, - __USER32_CS, __USER32_DS, __USER32_DS); + test_thread_flag(TIF_X32) + ? __USER_CS : __USER32_CS, + __USER_DS, __USER_DS); } #endif @@ -508,6 +510,7 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); @@ -522,22 +525,28 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(void) +void set_personality_ia32(bool x32) { /* inherit personality from parent */ /* Make sure to be in 32bit mode */ - set_thread_flag(TIF_IA32); set_thread_flag(TIF_ADDR32); - clear_thread_flag(TIF_X32); - current->personality |= force_personality32; /* Mark the associated mm as containing 32-bit tasks. */ if (current->mm) current->mm->context.ia32_compat = 1; - /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + if (x32) { + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + current->personality &= ~READ_IMPLIES_EXEC; + } else { + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; + } } unsigned long get_wchan(struct task_struct *p) -- cgit v1.2.3 From a06c9bc0647f66df0534fb887ddf6cddd35f426c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 11:08:37 -0800 Subject: x32: If configured, add x32 system calls to system call tables If CONFIG_X86_X32_ABI is defined, add the x32 system calls to the system call tables. Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/asm-offsets_64.c | 6 +++++- arch/x86/kernel/syscall_64.c | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index c3354f7b0a06..1b4754f82ba7 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -2,7 +2,11 @@ #define __SYSCALL_64(nr, sym, compat) [nr] = 1, #define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif static char syscalls_64[] = { #include }; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 26c4ca1f20e8..5c7f8c20da74 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -6,7 +6,12 @@ #include #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#define __SYSCALL_X32(nr, sym, compat) /* Not yet */ + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif #define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; #include -- cgit v1.2.3 From 5fd92e65a68b813667bc8739f5fa463e5bfcd66d Mon Sep 17 00:00:00 2001 From: "H. J. Lu" Date: Sun, 19 Feb 2012 10:40:03 -0800 Subject: x32: Allow x32 to be configured At this point, one should be able to build an x32 kernel. Note that for now we depend on CONFIG_IA32_EMULATION. Long term, x32 and IA32 should be detangled. Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..c9d6c9ed27e5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2165,9 +2165,9 @@ config IA32_EMULATION depends on X86_64 select COMPAT_BINFMT_ELF ---help--- - Include code to run 32-bit programs under a 64-bit kernel. You should - likely turn this on, unless you're 100% sure that you don't have any - 32-bit programs left. + Include code to run legacy 32-bit programs under a + 64-bit kernel. You should likely turn this on, unless you're + 100% sure that you don't have any 32-bit programs left. config IA32_AOUT tristate "IA32 a.out support" @@ -2175,9 +2175,22 @@ config IA32_AOUT ---help--- Support old a.out binaries in the 32bit emulation. +config X86_X32_ABI + bool "x32 ABI for 64-bit mode (EXPERIMENTAL)" + depends on X86_64 && IA32_EMULATION && EXPERIMENTAL + ---help--- + Include code to run binaries for the x32 native 32-bit ABI + for 64-bit processors. An x32 process gets access to the + full 64-bit register file and wide data path while leaving + pointers at 32 bits for smaller memory footprint. + + You will need a recent binutils (2.22 or later) with + elf32_x86_64 support enabled to compile a kernel with this + option set. + config COMPAT def_bool y - depends on IA32_EMULATION + depends on IA32_EMULATION || X86_X32_ABI config COMPAT_FOR_U64_ALIGNMENT def_bool COMPAT -- cgit v1.2.3 From 1a21d4e095ef720abf81299000afc038206d571b Mon Sep 17 00:00:00 2001 From: "H. J. Lu" Date: Sun, 19 Feb 2012 11:38:06 -0800 Subject: x32: Add x32 VDSO support Add support for the x32 VDSO. The x32 VDSO takes advantage of the similarity between the x86-64 and the x32 ABIs to contain the same content, only the container is different, as the x32 VDSO obviously is an x32 shared object. Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/.gitignore | 2 ++ arch/x86/vdso/Makefile | 46 +++++++++++++++++++++++++- arch/x86/vdso/vdso32-setup.c | 6 ++++ arch/x86/vdso/vdsox32.S | 22 +++++++++++++ arch/x86/vdso/vdsox32.lds.S | 32 ++++++++++++++++++ arch/x86/vdso/vma.c | 78 +++++++++++++++++++++++++++++++++++++++----- 6 files changed, 177 insertions(+), 9 deletions(-) create mode 100644 arch/x86/vdso/vdsox32.S create mode 100644 arch/x86/vdso/vdsox32.lds.S (limited to 'arch/x86') diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore index 60274d5746e1..3282874bc61d 100644 --- a/arch/x86/vdso/.gitignore +++ b/arch/x86/vdso/.gitignore @@ -1,5 +1,7 @@ vdso.lds vdso-syms.lds +vdsox32.lds +vdsox32-syms.lds vdso32-syms.lds vdso32-syscall-syms.lds vdso32-sysenter-syms.lds diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 5d179502a52c..fd14be1d1472 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -3,21 +3,29 @@ # VDSO64-$(CONFIG_X86_64) := y +VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y VDSO32-$(CONFIG_COMPAT) := y vdso-install-$(VDSO64-y) += vdso.so +vdso-install-$(VDSOX32-y) += vdsox32.so vdso-install-$(VDSO32-y) += $(vdso32-images) # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +vobjs-$(VDSOX32-y) += $(vobjx32s-compat) + +# Filter out x32 objects. +vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y)) + # files to link into kernel obj-$(VDSO64-y) += vma.o vdso.o +obj-$(VDSOX32-y) += vdsox32.o obj-$(VDSO32-y) += vdso32.o vdso32-setup.o -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) +vobjs := $(foreach F,$(vobj64s),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so @@ -72,6 +80,42 @@ endef $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE $(call if_changed,vdsosym) +# +# X32 processes use x32 vDSO to access 64bit kernel data. +# +# Build x32 vDSO image: +# 1. Compile x32 vDSO as 64bit. +# 2. Convert object files to x32. +# 3. 
Build x32 VDSO image with x32 objects, which contains 64bit codes +# so that it can reach 64bit address space with 64bit pointers. +# + +targets += vdsox32-syms.lds +obj-$(VDSOX32-y) += vdsox32-syms.lds + +CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ + -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 \ + -Wl,-z,common-page-size=4096 + +vobjx32s-y := $(vobj64s:.o=-x32.o) +vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) + +# Convert 64bit object file to x32 for x32 vDSO. +quiet_cmd_x32 = X32 $@ + cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@ + +$(obj)/%-x32.o: $(obj)/%.o FORCE + $(call if_changed,x32) + +targets += vdsox32.so vdsox32.so.dbg vdsox32.lds $(vobjx32s-y) + +$(obj)/vdsox32.o: $(src)/vdsox32.S $(obj)/vdsox32.so + +$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE + $(call if_changed,vdso) + # # Build multiple 32-bit vDSO images to choose from at boot time. # diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 468d591dde31..01b8a0df5e0e 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -317,6 +317,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) int ret = 0; bool compat; +#ifdef CONFIG_X86_X32_ABI + extern int x32_setup_additional_pages(struct linux_binprm *, int); + if (test_thread_flag(TIF_X32)) + return x32_setup_additional_pages (bprm, uses_interp); +#endif + if (vdso_enabled == VDSO_DISABLED) return 0; diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S new file mode 100644 index 000000000000..d6b9a7f42a8a --- /dev/null +++ b/arch/x86/vdso/vdsox32.S @@ -0,0 +1,22 @@ +#include +#include +#include + +__PAGE_ALIGNED_DATA + + .globl vdsox32_start, vdsox32_end + .align PAGE_SIZE +vdsox32_start: + .incbin "arch/x86/vdso/vdsox32.so" +vdsox32_end: + .align PAGE_SIZE /* extra data here leaks to userspace. 
*/ + +.previous + + .globl vdsox32_pages + .bss + .align 8 + .type vdsox32_pages, @object +vdsox32_pages: + .zero (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 + .size vdsox32_pages, .-vdsox32_pages diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S new file mode 100644 index 000000000000..373ca9a02a53 --- /dev/null +++ b/arch/x86/vdso/vdsox32.lds.S @@ -0,0 +1,32 @@ +/* + * Linker script for x32 vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. + */ + +#define VDSO_PRELINK 0 +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + time; + __vdso_time; + local: *; + }; +} + +VDSOX32_PRELINK = VDSO_PRELINK; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 153407c35b75..1bbcc6205ace 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -24,7 +24,44 @@ extern unsigned short vdso_sync_cpuid; extern struct page *vdso_pages[]; static unsigned vdso_size; -static void __init patch_vdso(void *vdso, size_t len) +#ifdef CONFIG_X86_X32_ABI +extern char vdsox32_start[], vdsox32_end[]; +extern struct page *vdsox32_pages[]; +static unsigned vdsox32_size; + +static void __init patch_vdsox32(void *vdso, size_t len) +{ + Elf32_Ehdr *hdr = vdso; + Elf32_Shdr *sechdrs, *alt_sec = 0; + char *secstrings; + void *alt_data; + int i; + + BUG_ON(len < sizeof(Elf32_Ehdr)); + BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); + + sechdrs = (void *)hdr + hdr->e_shoff; + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (i = 1; i < 
hdr->e_shnum; i++) { + Elf32_Shdr *shdr = &sechdrs[i]; + if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { + alt_sec = shdr; + goto found; + } + } + + /* If we get here, it's probably a bug. */ + pr_warning("patch_vdsox32: .altinstructions not found\n"); + return; /* nothing to patch */ + +found: + alt_data = (void *)hdr + alt_sec->sh_offset; + apply_alternatives(alt_data, alt_data + alt_sec->sh_size); +} +#endif + +static void __init patch_vdso64(void *vdso, size_t len) { Elf64_Ehdr *hdr = vdso; Elf64_Shdr *sechdrs, *alt_sec = 0; @@ -47,7 +84,7 @@ static void __init patch_vdso(void *vdso, size_t len) } /* If we get here, it's probably a bug. */ - pr_warning("patch_vdso: .altinstructions not found\n"); + pr_warning("patch_vdso64: .altinstructions not found\n"); return; /* nothing to patch */ found: @@ -60,12 +97,20 @@ static int __init init_vdso(void) int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; int i; - patch_vdso(vdso_start, vdso_end - vdso_start); + patch_vdso64(vdso_start, vdso_end - vdso_start); vdso_size = npages << PAGE_SHIFT; for (i = 0; i < npages; i++) vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); +#ifdef CONFIG_X86_X32_ABI + patch_vdsox32(vdsox32_start, vdsox32_end - vdsox32_start); + npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE; + vdsox32_size = npages << PAGE_SHIFT; + for (i = 0; i < npages; i++) + vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE); +#endif + return 0; } subsys_initcall(init_vdso); @@ -103,7 +148,10 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Setup a VMA at program startup for the vsyscall page. 
Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int setup_additional_pages(struct linux_binprm *bprm, + int uses_interp, + struct page **pages, + unsigned size) { struct mm_struct *mm = current->mm; unsigned long addr; @@ -113,8 +161,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return 0; down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, vdso_size); - addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0); + addr = vdso_addr(mm->start_stack, size); + addr = get_unmapped_area(NULL, addr, size, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; @@ -122,11 +170,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) current->mm->context.vdso = (void *)addr; - ret = install_special_mapping(mm, addr, vdso_size, + ret = install_special_mapping(mm, addr, size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, - vdso_pages); + pages); if (ret) { current->mm->context.vdso = NULL; goto up_fail; @@ -137,6 +185,20 @@ up_fail: return ret; } +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + return setup_additional_pages (bprm, uses_interp, vdso_pages, + vdso_size); +} + +#ifdef CONFIG_X86_X32_ABI +int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + return setup_additional_pages (bprm, uses_interp, vdsox32_pages, + vdsox32_size); +} +#endif + static __init int vdso_setup(char *s) { vdso_enabled = simple_strtoul(s, NULL, 0); -- cgit v1.2.3 From a38449ef596b345e13a8f9b7d5cd9fedb8fcf921 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Feb 2012 15:29:34 -0500 Subject: x86: Specify a size for the cmp in the NMI handler Linus noticed that the cmp used to check if the code segment is __KERNEL_CS or not did not specify a size. Perhaps it does not matter as H. Peter Anvin noted that user space can not set the bottom two bits of the %cs register. 
But it's best not to let the assembly choose and change things between different versions of gas, but instead just pick the size. Four bytes are used to compare the saved code segment against __KERNEL_CS. Perhaps this might mess up Xen, but we can fix that when the time comes. Also I noticed that there was another non-specified cmp that checks the special stack variable if it is 1 or 0. This too probably doesn't matter what cmp is used, but this patch uses cmpl just to make it non ambiguous. Link: http://lkml.kernel.org/r/CA+55aFxfAn9MWRgS3O5k2tqN5ys1XrhSFVO5_9ZAoZKDVgNfGA@mail.gmail.com Suggested-by: Linus Torvalds Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index debd851de6ff..1333d9851778 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1535,14 +1535,14 @@ ENTRY(nmi) * If %cs was not the kernel segment, then the NMI triggered in user * space, which means it is definitely not nested. */ - cmp $__KERNEL_CS, 16(%rsp) + cmpl $__KERNEL_CS, 16(%rsp) jne first_nmi /* * Check the special variable on the stack to see if NMIs are * executing. */ - cmp $1, -8(%rsp) + cmpl $1, -8(%rsp) je nested_nmi /* -- cgit v1.2.3 From 27e74da9800289e69ba907777df1e2085231eff7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Feb 2012 19:34:10 -0800 Subject: i387: export 'fpu_owner_task' per-cpu variable (And define it properly for x86-32, which had its 'current_task' declaration in separate from x86-64) Bitten by my dislike for modules on the machines I use, and the fact that apparently nobody else actually wanted to test the patches I sent out. Snif. Nobody else cares. Anyway, we probably should uninline the 'kernel_fpu_begin()' function that is what modules actually use and that references this, but this is the minimal fix for now. 
Reported-by: Josh Boyer Reported-and-tested-by: Jongman Heo Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b667148dfad7..c0f7d68d318f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1045,6 +1045,7 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1113,6 +1114,8 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); -- cgit v1.2.3 From 8546c008924d5fd1724fa698eaa92b414bafd50d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 10:25:45 -0800 Subject: i387: Uninline the generic FP helpers that we expose to kernel modules Instead of exporting the very low-level internals of the FPU state save/restore code (ie things like 'fpu_owner_task'), we should export the higher-level interfaces. Inlining these things is pointless anyway: sure, sometimes the end result is small, but while 'stts()' can result in just three x86 instructions, those are not cheap instructions (writing %cr0 is a serializing instruction and a very slow one at that). So the overhead of a function call is not noticeable, and we really don't want random modules mucking about with our internal state save logic anyway. So this unexports 'fpu_owner_task', and instead uninlines and exports the actual functions that modules can use: kernel_fpu_begin/end() and unlazy_fpu().
Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211339590.5354@i5.linux-foundation.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 78 +++--------------------------------------- arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/i387.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 247904945d3f..0c1031d354f2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -419,70 +419,9 @@ static inline void __clear_fpu(struct task_struct *tsk) } } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * We can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - */ -static inline bool interrupted_kernel_fpu_idle(void) -{ - return !__thread_has_fpu(current) && - (read_cr0() & X86_CR0_TS); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static inline bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode_vm(regs); -} - -/* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. 
- */ -static inline bool irq_fpu_usable(void) -{ - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); -} - -static inline void kernel_fpu_begin(void) -{ - struct task_struct *me = current; - - WARN_ON_ONCE(!irq_fpu_usable()); - preempt_disable(); - if (__thread_has_fpu(me)) { - __save_init_fpu(me); - __thread_clear_has_fpu(me); - /* We do 'stts()' in kernel_fpu_end() */ - } else { - percpu_write(fpu_owner_task, NULL); - clts(); - } -} - -static inline void kernel_fpu_end(void) -{ - stts(); - preempt_enable(); -} +extern bool irq_fpu_usable(void); +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); /* * Some instructions like VIA's padlock instructions generate a spurious @@ -566,16 +505,7 @@ static inline void save_init_fpu(struct task_struct *tsk) preempt_enable(); } -static inline void unlazy_fpu(struct task_struct *tsk) -{ - preempt_disable(); - if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->fpu_counter = 0; - preempt_enable(); -} +extern void unlazy_fpu(struct task_struct *tsk); static inline void clear_fpu(struct task_struct *tsk) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c0f7d68d318f..cb71b01ab66e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1045,7 +1045,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1115,7 +1114,6 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); diff --git a/arch/x86/kernel/i387.c 
b/arch/x86/kernel/i387.c index 739d8598f789..17b7549c4134 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -32,6 +32,86 @@ # define user32_fxsr_struct user_fxsr_struct #endif +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + return !__thread_has_fpu(current) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. 
+ */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void kernel_fpu_begin(void) +{ + struct task_struct *me = current; + + WARN_ON_ONCE(!irq_fpu_usable()); + preempt_disable(); + if (__thread_has_fpu(me)) { + __save_init_fpu(me); + __thread_clear_has_fpu(me); + /* We do 'stts()' in kernel_fpu_end() */ + } else { + percpu_write(fpu_owner_task, NULL); + clts(); + } +} +EXPORT_SYMBOL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + stts(); + preempt_enable(); +} +EXPORT_SYMBOL(kernel_fpu_end); + +void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; + preempt_enable(); +} +EXPORT_SYMBOL(unlazy_fpu); + #ifdef CONFIG_MATH_EMULATION # define HAVE_HWFP (boot_cpu_data.hard_math) #else -- cgit v1.2.3 From 1361b83a13d4d92e53fbb6c877528713e118b821 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 13:19:22 -0800 Subject: i387: Split up <asm/i387.h> into exported and internal interfaces While various modules include <asm/i387.h> to get access to things we actually *intend* for them to use, most of that header file was really pretty low-level internal stuff that we really don't want to expose to others. So split the header file into two: the small exported interfaces remain in <asm/i387.h>, while the internal definitions that are only used by core architecture code are now in <asm/fpu-internal.h>. The guiding principle for this was to expose functions that we export to modules, and leave them in <asm/i387.h>, while stuff that is used by task switching or was marked GPL-only is in <asm/fpu-internal.h>. The fpu-internal.h file could be further split up too, especially since arch/x86/kvm/ uses some of the remaining stuff for its module. But that kvm usage should probably be abstracted out a bit, and at least now the internal FPU accessor functions are much more contained.
Even if it isn't perhaps as contained as it _could_ be. Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211340330.5354@i5.linux-foundation.org Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 1 + arch/x86/include/asm/fpu-internal.h | 520 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/i387.h | 512 +---------------------------------- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/i387.c | 3 +- arch/x86/kernel/process.c | 1 + arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/ptrace.c | 1 + arch/x86/kernel/signal.c | 1 + arch/x86/kernel/traps.c | 1 + arch/x86/kernel/xsave.c | 1 + arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 1 + arch/x86/power/cpu.c | 1 + 15 files changed, 540 insertions(+), 508 deletions(-) create mode 100644 arch/x86/include/asm/fpu-internal.h (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 65577698cab2..5563ba1cf513 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h new file mode 100644 index 000000000000..4fa88154e4de --- /dev/null +++ b/arch/x86/include/asm/fpu-internal.h @@ -0,0 +1,520 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _FPU_INTERNAL_H +#define _FPU_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern unsigned int sig_xstate_size; +extern void fpu_init(void); + +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + +extern user_regset_active_fn fpregs_active, xfpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern 
user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active + +extern struct _fpx_sw_bytes fx_sw_reserved; +#ifdef CONFIG_IA32_EMULATION +extern unsigned int sig_xstate_ia32_size; +extern struct _fpx_sw_bytes fx_sw_reserved_ia32; +struct _fpstate_ia32; +struct _xstate_ia32; +extern int save_i387_xstate_ia32(void __user *buf); +extern int restore_i387_xstate_ia32(void __user *buf); +#endif + +#ifdef CONFIG_MATH_EMULATION +extern void finit_soft_fpu(struct i387_soft_struct *soft); +#else +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} +#endif + +#define X87_FSW_ES (1 << 7) /* Exception Summary */ + +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has(X86_FEATURE_FXSR); +} + +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + +#ifdef CONFIG_X86_64 +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +{ + int err; + + /* See comment in fxsave() below. 
*/ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxrstorq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "m" (*fx), "0" (0)); +#else + asm volatile("1: rex64/fxrstor (%[fx])\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "R" (fx), "m" (*fx), "0" (0)); +#endif + return err; +} + +static inline int fxsave_user(struct i387_fxsave_struct __user *fx) +{ + int err; + + /* + * Clear the bytes not touched by the fxsave and reserved + * for the SW usage. + */ + err = __clear_user(&fx->sw_reserved, + sizeof(struct _fpx_sw_bytes)); + if (unlikely(err)) + return -EFAULT; + + /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxsaveq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), [fx] "=m" (*fx) + : "0" (0)); +#else + asm volatile("1: rex64/fxsave (%[fx])\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), "=m" (*fx) + : [fx] "R" (fx), "0" (0)); +#endif + if (unlikely(err) && + __clear_user(fx, sizeof(struct i387_fxsave_struct))) + err = -EFAULT; + /* No need to clear here because the caller clears USED_MATH */ + return err; +} + +static inline void fpu_fxsave(struct fpu *fpu) +{ + /* Using "rex64; fxsave %0" is broken because, if the memory operand + uses any extended registers for addressing, a second REX prefix + will be generated (to the assembler, rex64 followed by semicolon + is a separate instruction), and hence the 64-bitness is lost. */ + +#ifdef CONFIG_AS_FXSAVEQ + /* Using "fxsaveq %0" would be the ideal choice, but is only supported + starting with gas 2.16. 
*/ + __asm__ __volatile__("fxsaveq %0" + : "=m" (fpu->state->fxsave)); +#else + /* Using, as a workaround, the properly prefixed form below isn't + accepted by any binutils version so far released, complaining that + the same type of prefix is used twice if an extended register is + needed for addressing (fix submitted to mainline 2005-11-21). + asm volatile("rex64/fxsave %0" + : "=m" (fpu->state->fxsave)); + This, however, we can work around by forcing the compiler to select + an addressing mode that doesn't require extended registers. */ + asm volatile("rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); +#endif +} + +#else /* CONFIG_X86_32 */ + +/* perform fxrstor iff the processor has extended states, otherwise frstor */ +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +{ + /* + * The "nop" is needed to make the instructions the same + * length. + */ + alternative_input( + "nop ; frstor %1", + "fxrstor %1", + X86_FEATURE_FXSR, + "m" (*fx)); + + return 0; +} + +static inline void fpu_fxsave(struct fpu *fpu) +{ + asm volatile("fxsave %[fx]" + : [fx] "=m" (fpu->state->fxsave)); +} + +#endif /* CONFIG_X86_64 */ + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact. + */ +static inline int fpu_save_init(struct fpu *fpu) +{ + if (use_xsave()) { + fpu_xsave(fpu); + + /* + * xsave header may indicate the init state of the FP. + */ + if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + return 1; + } else if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + asm volatile("fnsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + return 0; + } + + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? 
+ */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { + asm volatile("fnclex"); + return 0; + } + return 1; +} + +static inline int __save_init_fpu(struct task_struct *tsk) +{ + return fpu_save_init(&tsk->thread.fpu); +} + +static inline int fpu_fxrstor_checking(struct fpu *fpu) +{ + return fxrstor_checking(&fpu->state->fxsave); +} + +static inline int fpu_restore_checking(struct fpu *fpu) +{ + if (use_xsave()) + return fpu_xrstor_checking(fpu); + else + return fpu_fxrstor_checking(fpu); +} + +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. "m" is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (tsk->thread.fpu.has_fpu)); + + return fpu_restore_checking(&tsk->thread.fpu); +} + +/* + * Software FPU state helpers. Careful: these need to + * be preemption protection *and* they need to be + * properly paired with the CR0.TS changes! + */ +static inline int __thread_has_fpu(struct task_struct *tsk) +{ + return tsk->thread.fpu.has_fpu; +} + +/* Must be paired with an 'stts' after! */ +static inline void __thread_clear_has_fpu(struct task_struct *tsk) +{ + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); +} + +/* Must be paired with a 'clts' before! */ +static inline void __thread_set_has_fpu(struct task_struct *tsk) +{ + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. 
+ */ +static inline void __thread_fpu_end(struct task_struct *tsk) +{ + __thread_clear_has_fpu(tsk); + stts(); +} + +static inline void __thread_fpu_begin(struct task_struct *tsk) +{ + clts(); + __thread_set_has_fpu(tsk); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +/* + * FIXME! We could do a totally lazy restore, but we need to + * add a per-cpu "this was the task that last touched the FPU + * on this CPU" variable, and the task needs to have a "I last + * touched the FPU on this CPU" and check them. + * + * We don't do that yet, so "fpu_lazy_restore()" always returns + * false, but some day.. + */ +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} + +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + + /* Don't change CR0.TS if we just switch! 
*/ + if (fpu.preload) { + new->fpu_counter++; + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else + stts(); + } else { + old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; + if (fpu.preload) { + new->fpu_counter++; + if (fpu_lazy_restore(new, cpu)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + __thread_fpu_begin(new); + } + } + return fpu; +} + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) +{ + if (fpu.preload) { + if (unlikely(restore_fpu_checking(new))) + __thread_fpu_end(new); + } +} + +/* + * Signal frame handlers... + */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + +static inline void __clear_fpu(struct task_struct *tsk) +{ + if (__thread_has_fpu(tsk)) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + __thread_fpu_end(tsk); + } +} + +/* + * The actual user_fpu_begin/end() functions + * need to be preemption-safe. + * + * NOTE! user_fpu_end() must be used only after you + * have saved the FP state, and user_fpu_begin() must + * be used only immediately before restoring it. + * These functions do not do any save/restore on + * their own. 
+ */ +static inline void user_fpu_end(void) +{ + preempt_disable(); + __thread_fpu_end(current); + preempt_enable(); +} + +static inline void user_fpu_begin(void) +{ + preempt_disable(); + if (!user_has_fpu()) + __thread_fpu_begin(current); + preempt_enable(); +} + +/* + * These disable preemption on their own and are safe + */ +static inline void save_init_fpu(struct task_struct *tsk) +{ + WARN_ON_ONCE(!__thread_has_fpu(tsk)); + preempt_disable(); + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + preempt_enable(); +} + +static inline void clear_fpu(struct task_struct *tsk) +{ + preempt_disable(); + __clear_fpu(tsk); + preempt_enable(); +} + +/* + * i387 state interaction + */ +static inline unsigned short get_fpu_cwd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.cwd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.swd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) +{ + if (cpu_has_xmm) { + return tsk->thread.fpu.state->fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +static bool fpu_allocated(struct fpu *fpu) +{ + return fpu->state != NULL; +} + +static inline int fpu_alloc(struct fpu *fpu) +{ + if (fpu_allocated(fpu)) + return 0; + fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); + if (!fpu->state) + return -ENOMEM; + WARN_ON((unsigned long)fpu->state & 15); + return 0; +} + +static inline void fpu_free(struct fpu *fpu) +{ + if (fpu->state) { + kmem_cache_free(task_xstate_cachep, fpu->state); + fpu->state = NULL; + } +} + +static inline void fpu_copy(struct fpu *dst, struct fpu *src) +{ + memcpy(dst->state, src->state, xstate_size); +} + +extern void fpu_finit(struct fpu *fpu); + +#endif diff --git a/arch/x86/include/asm/i387.h 
b/arch/x86/include/asm/i387.h index 0c1031d354f2..7ce0798b1b26 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -13,411 +13,15 @@ #ifndef __ASSEMBLY__ #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +struct pt_regs; +struct user_i387_struct; -extern unsigned int sig_xstate_size; -extern void fpu_init(void); -extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); - -DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); - -extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. 
- */ -#define xstateregs_active fpregs_active - -extern struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -extern unsigned int sig_xstate_ia32_size; -extern struct _fpx_sw_bytes fx_sw_reserved_ia32; -struct _fpstate_ia32; -struct _xstate_ia32; -extern int save_i387_xstate_ia32(void __user *buf); -extern int restore_i387_xstate_ia32(void __user *buf); -#endif - -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - -extern void __sanitize_i387_state(struct task_struct *); - -static inline void sanitize_i387_state(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __sanitize_i387_state(tsk); -} - -#ifdef CONFIG_X86_64 -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - int err; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxrstorq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "m" (*fx), "0" (0)); -#else - asm volatile("1: rex64/fxrstor (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "R" (fx), "m" (*fx), "0" (0)); -#endif - return err; -} - -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) -{ - int err; - - /* - * Clear the bytes not touched by the fxsave and reserved - * for the SW usage. 
- */ - err = __clear_user(&fx->sw_reserved, - sizeof(struct _fpx_sw_bytes)); - if (unlikely(err)) - return -EFAULT; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxsaveq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), [fx] "=m" (*fx) - : "0" (0)); -#else - asm volatile("1: rex64/fxsave (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), "=m" (*fx) - : [fx] "R" (fx), "0" (0)); -#endif - if (unlikely(err) && - __clear_user(fx, sizeof(struct i387_fxsave_struct))) - err = -EFAULT; - /* No need to clear here because the caller clears USED_MATH */ - return err; -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - /* Using "rex64; fxsave %0" is broken because, if the memory operand - uses any extended registers for addressing, a second REX prefix - will be generated (to the assembler, rex64 followed by semicolon - is a separate instruction), and hence the 64-bitness is lost. */ - -#ifdef CONFIG_AS_FXSAVEQ - /* Using "fxsaveq %0" would be the ideal choice, but is only supported - starting with gas 2.16. */ - __asm__ __volatile__("fxsaveq %0" - : "=m" (fpu->state->fxsave)); -#else - /* Using, as a workaround, the properly prefixed form below isn't - accepted by any binutils version so far released, complaining that - the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). - asm volatile("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); - This, however, we can work around by forcing the compiler to select - an addressing mode that doesn't require extended registers. 
*/ - asm volatile("rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); -#endif -} - -#else /* CONFIG_X86_32 */ - -/* perform fxrstor iff the processor has extended states, otherwise frstor */ -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - /* - * The "nop" is needed to make the instructions the same - * length. - */ - alternative_input( - "nop ; frstor %1", - "fxrstor %1", - X86_FEATURE_FXSR, - "m" (*fx)); - - return 0; -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - asm volatile("fxsave %[fx]" - : [fx] "=m" (fpu->state->fxsave)); -} - -#endif /* CONFIG_X86_64 */ - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. - */ -static inline int fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) { - fpu_xsave(fpu); - - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; - } - - /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? 
- */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - return 0; - } - return 1; -} - -static inline int __save_init_fpu(struct task_struct *tsk) -{ - return fpu_save_init(&tsk->thread.fpu); -} - -static inline int fpu_fxrstor_checking(struct fpu *fpu) -{ - return fxrstor_checking(&fpu->state->fxsave); -} - -static inline int fpu_restore_checking(struct fpu *fpu) -{ - if (use_xsave()) - return fpu_xrstor_checking(fpu); - else - return fpu_fxrstor_checking(fpu); -} - -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. "m" is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.fpu.has_fpu)); - - return fpu_restore_checking(&tsk->thread.fpu); -} - -/* - * Software FPU state helpers. Careful: these need to - * be preemption protection *and* they need to be - * properly paired with the CR0.TS changes! - */ -static inline int __thread_has_fpu(struct task_struct *tsk) -{ - return tsk->thread.fpu.has_fpu; -} - -/* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 0; - percpu_write(fpu_owner_task, NULL); -} - -/* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 1; - percpu_write(fpu_owner_task, tsk); -} - -/* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. 
- */ -static inline void __thread_fpu_end(struct task_struct *tsk) -{ - __thread_clear_has_fpu(tsk); - stts(); -} - -static inline void __thread_fpu_begin(struct task_struct *tsk) -{ - clts(); - __thread_set_has_fpu(tsk); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. - * - * - switch_fpu_finish() restores the new state as - * necessary. - */ -typedef struct { int preload; } fpu_switch_t; - -/* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. - * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. - */ -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == percpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) -{ - fpu_switch_t fpu; - - fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; - if (__thread_has_fpu(old)) { - if (!__save_init_fpu(old)) - cpu = ~0; - old->thread.fpu.last_cpu = cpu; - old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ - - /* Don't change CR0.TS if we just switch! 
*/ - if (fpu.preload) { - new->fpu_counter++; - __thread_set_has_fpu(new); - prefetch(new->thread.fpu.state); - } else - stts(); - } else { - old->fpu_counter = 0; - old->thread.fpu.last_cpu = ~0; - if (fpu.preload) { - new->fpu_counter++; - if (fpu_lazy_restore(new, cpu)) - fpu.preload = 0; - else - prefetch(new->thread.fpu.state); - __thread_fpu_begin(new); - } - } - return fpu; -} - -/* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. - */ -static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) -{ - if (fpu.preload) { - if (unlikely(restore_fpu_checking(new))) - __thread_fpu_end(new); - } -} - -/* - * Signal frame handlers... - */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); - -static inline void __clear_fpu(struct task_struct *tsk) -{ - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); - } -} +extern void math_state_restore(void); extern bool irq_fpu_usable(void); extern void kernel_fpu_begin(void); @@ -463,118 +67,14 @@ static inline void irq_ts_restore(int TS_state) * we can just assume we have FPU access - typically * to save the FP state - we'll just take a #NM * fault and get the FPU access back. - * - * The actual user_fpu_begin/end() functions - * need to be preemption-safe, though. - * - * NOTE! user_fpu_end() must be used only after you - * have saved the FP state, and user_fpu_begin() must - * be used only immediately before restoring it. - * These functions do not do any save/restore on - * their own. 
*/ static inline int user_has_fpu(void) { - return __thread_has_fpu(current); -} - -static inline void user_fpu_end(void) -{ - preempt_disable(); - __thread_fpu_end(current); - preempt_enable(); -} - -static inline void user_fpu_begin(void) -{ - preempt_disable(); - if (!user_has_fpu()) - __thread_fpu_begin(current); - preempt_enable(); -} - -/* - * These disable preemption on their own and are safe - */ -static inline void save_init_fpu(struct task_struct *tsk) -{ - WARN_ON_ONCE(!__thread_has_fpu(tsk)); - preempt_disable(); - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - preempt_enable(); + return current->thread.fpu.has_fpu; } extern void unlazy_fpu(struct task_struct *tsk); -static inline void clear_fpu(struct task_struct *tsk) -{ - preempt_disable(); - __clear_fpu(tsk); - preempt_enable(); -} - -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - -static bool fpu_allocated(struct fpu *fpu) -{ - return fpu->state != NULL; -} - -static inline int fpu_alloc(struct fpu *fpu) -{ - if (fpu_allocated(fpu)) - return 0; - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - WARN_ON((unsigned long)fpu->state & 15); - return 0; -} - -static inline void fpu_free(struct fpu *fpu) -{ - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } -} - -static inline void fpu_copy(struct fpu *dst, struct fpu 
*src) -{ - memcpy(dst->state, src->state, xstate_size); -} - -extern void fpu_finit(struct fpu *fpu); - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb71b01ab66e..89620b1725d4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 17b7549c4134..7734bcbb5a3a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_64 @@ -124,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size); unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; -void __cpuinit mxcsr_feature_mask_init(void) +static void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe3..c38d84e01022 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,6 +21,7 @@ #include #include #include +#include #include struct kmem_cache *task_xstate_cachep; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c08d1ff12b7c..ee32dee7a0a3 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #ifdef CONFIG_MATH_EMULATION #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cfa5c90c01db..5bad3c71e48f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 50267386b766..78f05e438be5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -27,6 +27,7 @@ #include #include 
#include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc27e2..25edcfc9ba5b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bbe04d96744..ec61d4c1b93b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 711091114119..e62728e30b01 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION #include #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad906..246490f643b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (__thread_has_fpu(current)) + if (user_has_fpu()) clts(); load_gdt(&__get_cpu_var(host_gdt)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc0698118..b937b6179d80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -57,6 +57,7 @@ #include #include #include +#include /* Ugh! */ #include #include #include diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index f10c0afa1cb4..4889655ba784 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include /* pcntxt_mask */ #ifdef CONFIG_X86_32 static struct saved_context saved_context; -- cgit v1.2.3 From 22e842d4d90ffec9677cc114487a5cefd39b5643 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Tue, 21 Feb 2012 14:32:19 -0800 Subject: x32: Fix coding style violations in the x32 VDSO code Move the prototype for x32_setup_additional_pages() to a header file, and adjust the coding style to match standard. Signed-off-by: H. Peter Anvin Cc: H. J. Lu --- arch/x86/include/asm/elf.h | 2 ++ arch/x86/vdso/vdso32-setup.c | 3 +-- arch/x86/vdso/vma.c | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 83aabea95dd7..1e40634591a4 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -331,6 +331,8 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); +extern int x32_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); extern int syscall32_setup_pages(struct linux_binprm *, int exstack); #define compat_arch_setup_additional_pages syscall32_setup_pages diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 01b8a0df5e0e..10f9f59477db 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -318,9 +318,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) bool compat; #ifdef CONFIG_X86_X32_ABI - extern int x32_setup_additional_pages(struct linux_binprm *, int); if (test_thread_flag(TIF_X32)) - return x32_setup_additional_pages (bprm, uses_interp); + return x32_setup_additional_pages(bprm, uses_interp); #endif if (vdso_enabled == VDSO_DISABLED) diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 1bbcc6205ace..d7dce1dbf8c9 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -187,15 +187,15 @@ up_fail: int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages (bprm, uses_interp, vdso_pages, - vdso_size); + return setup_additional_pages(bprm, uses_interp, vdso_pages, + 
vdso_size); } #ifdef CONFIG_X86_X32_ABI int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages (bprm, uses_interp, vdsox32_pages, - vdsox32_size); + return setup_additional_pages(bprm, uses_interp, vdsox32_pages, + vdsox32_size); } #endif -- cgit v1.2.3 From 513c4ec6e4759aa33c90af0658b82eb4d2027871 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 21 Feb 2012 17:25:50 -0800 Subject: x86, cpufeature: Add CPU features from Intel document 319433-012A Add CPU features from the Intel Architecture Instruction Set Extensions Programming Reference version 012A (Feb 2012), document number 319433-012A. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8d67d428b0f9..0d3dcc9cbab6 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -199,10 +199,13 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3 From b0e5c77903fd717cc5eb02b7b8f5de3c869efc49 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 6 Feb 2012 18:32:20 -0800 Subject: x86/tsc: Reduce the TSC sync check time for
core-siblings For each logical CPU that is coming online, we spend 20msec for checking the TSC synchronization. And as this is done sequentially for each logical CPU boot, this time gets added up depending on the number of logical CPUs supported by the platform. Minimize this by using the socket topology information. If the target CPU coming online doesn't have any of its core-siblings online, a timeout of 20msec will be used for the TSC-warp measurement loop. Otherwise a smaller timeout of 2msec will be used, as we have some information about this socket already (and this information grows as we have more and more logical-siblings in that socket). Ideally we should be able to skip the TSC sync check on the other core-siblings, if the first logical CPU in a socket passed the sync test. But as the TSC is per-logical CPU and can potentially be modified wrongly by the BIOS before the OS boot, TSC sync test for smaller duration should be able to catch such errors. Also this will catch the condition where all the cores in the socket don't get reset at the same time. For example, with this modification, time spent in TSC sync checks on a 4 socket 10-core with HT system gets reduced from 1580msec to 212msec.
Signed-off-by: Suresh Siddha Acked-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Jack Steiner Cc: venki@google.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1328581940.29790.20.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9eba29b46cb7..fc25e60a5884 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps; /* * TSC-warp measurement loop running on both CPUs: */ -static __cpuinit void check_tsc_warp(void) +static __cpuinit void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; int i; @@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(void) start = get_cycles(); rdtsc_barrier(); /* - * The measurement runs for 20 msecs: + * The measurement runs for 'timeout' msecs: */ - end = start + tsc_khz * 20ULL; + end = start + (cycles_t) tsc_khz * timeout; now = start; for (i = 0; ; i++) { @@ -98,6 +98,25 @@ static __cpuinit void check_tsc_warp(void) now-start, end-start); } +/* + * If the target CPU coming online doesn't have any of its core-siblings + * online, a timeout of 20msec will be used for the TSC-warp measurement + * loop. Otherwise a smaller timeout of 2msec will be used, as we have some + * information about this socket already (and this information grows as we + * have more and more logical-siblings in that socket). + * + * Ideally we should be able to skip the TSC sync check on the other + * core-siblings, if the first logical CPU in a socket passed the sync test. + * But as the TSC is per-logical CPU and can potentially be modified wrongly + * by the bios, TSC sync test for smaller duration should be able + * to catch such errors. 
Also this will catch the condition where all the + * cores in the socket doesn't get reset at the same time. + */ +static inline unsigned int loop_timeout(int cpu) +{ + return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; +} + /* * Source CPU calls into this - it waits for the freshly booted * target CPU to arrive and then starts the measurement: @@ -135,7 +154,7 @@ void __cpuinit check_tsc_sync_source(int cpu) */ atomic_inc(&start_count); - check_tsc_warp(); + check_tsc_warp(loop_timeout(cpu)); while (atomic_read(&stop_count) != cpus-1) cpu_relax(); @@ -183,7 +202,7 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&start_count) != cpus) cpu_relax(); - check_tsc_warp(); + check_tsc_warp(loop_timeout(smp_processor_id())); /* * Ok, we are done: -- cgit v1.2.3 From 3f806e50981825fa56a7f1938f24c0680816be45 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Feb 2012 20:18:01 +0100 Subject: x86/mce/AMD: Fix UP build error 141168c36cde ("x86: Simplify code by removing a !SMP #ifdefs from 'struct cpuinfo_x86'") removed a bunch of CONFIG_SMP ifdefs around code touching struct cpuinfo_x86 members but also caused the following build error with Randy's randconfigs: mce_amd.c:(.cpuinit.text+0x4723): undefined reference to `cpu_llc_shared_map' Restore the #ifdef in threshold_create_bank() which creates symlinks on the non-BSP CPUs. There's a better patch series being worked on by Kevin Winchester which will solve this in a cleaner fashion, but that series is too ambitious for v3.3 merging - so we first queue up this trivial fix and then do the rest for v3.4. 
Signed-off-by: Borislav Petkov Acked-by: Kevin Winchester Cc: Randy Dunlap Cc: Nick Bowler Link: http://lkml.kernel.org/r/20120203191801.GA2846@x1.osrc.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 786e76a86322..e4eeaaf58a47 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -528,6 +528,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); +#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -553,6 +554,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } +#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { -- cgit v1.2.3 From 140f190bc3a3b6f200548d204befd998eadd63fd Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 22 Feb 2012 10:06:44 -0800 Subject: x86: Remove some noise from boot log when starting cpus Printing the "start_ip" for every secondary cpu is very noisy on a large system - and doesn't add any value. Drop this message. Console log before: Booting Node 0, Processors #1 smpboot cpu 1: start_ip = 96000 #2 smpboot cpu 2: start_ip = 96000 #3 smpboot cpu 3: start_ip = 96000 #4 smpboot cpu 4: start_ip = 96000 ... #31 smpboot cpu 31: start_ip = 96000 Brought up 32 CPUs Console log after: Booting Node 0, Processors #1 #2 #3 #4 #5 #6 #7 Ok. Booting Node 1, Processors #8 #9 #10 #11 #12 #13 #14 #15 Ok. Booting Node 0, Processors #16 #17 #18 #19 #20 #21 #22 #23 Ok. Booting Node 1, Processors #24 #25 #26 #27 #28 #29 #30 #31 Brought up 32 CPUs Acked-by: Borislav Petkov Signed-off-by: Tony Luck Link: http://lkml.kernel.org/r/4f452eb42507460426@agluck-desktop.sc.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/smpboot.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..683575250a65 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -740,8 +740,6 @@ do_rest: * the targeted processor. */ - printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); - atomic_set(&init_deasserted, 0); if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { -- cgit v1.2.3 From d6126ef5f31ca54980cb067af659a360dfcca037 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 26 Jan 2012 15:49:14 -0800 Subject: x86/mce: Convert static array of pointers to per-cpu variables When I previously fixed up the mce_device code, I used a static array of the pointers. It was (rightfully) pointed out to me that I should be using the per_cpu code instead. This patch converts the code over to that structure, moving the variable back into the per_cpu area, like it used to be for 3.2 and earlier. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Srivatsa S. Bhat Link: https://lkml.org/lkml/2012/1/27/165 Signed-off-by: Tony Luck --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6aefb14cbbc5..441520e4174f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -extern struct device *mce_device[CONFIG_NR_CPUS]; +DECLARE_PER_CPU(struct device *, mce_device); /* * Maximum banks number. 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2e9e91..4979a5dfeba2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -struct device *mce_device[CONFIG_NR_CPUS]; +DEFINE_PER_CPU(struct device *, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2038,7 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); - mce_device[cpu] = dev; + per_cpu(mce_device, cpu) = dev; return 0; error2: @@ -2055,7 +2055,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2069,7 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); - mce_device[cpu] = NULL; + per_cpu(mce_device, cpu) = NULL; } /* Make sure there are no machine checks on offlined CPUs. 
*/ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 786e76a86322..a4bf9d23cdba 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -585,7 +585,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) @@ -665,7 +665,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&mce_device[cpu]->kobj, name); + dev = per_cpu(mce_device, cpu); + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -677,7 +678,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; -- cgit v1.2.3 From fadd85f16a8ec3fee8af599e79a209682dc52348 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 23 Jan 2012 15:54:52 -0500 Subject: x86/mce: Fix return value of mce_chrdev_read() when erst is disabled Current kernel MCE code reads ERST at the first reading of /dev/mcelog (maybe in starting mcelogd,) even if the system does not support ERST, which results in a fake "no such device" message (as described in [1].) This problem is not critical, but can confuse system admins. This patch fixes it by filtering the return value from lower (ACPI) layer. 
[1] http://thread.gmane.org/gmane.linux.kernel/1060250 Reported by: Jon Masters Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Huang Ying Link: https://lkml.org/lkml/2012/1/23/299 Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4979a5dfeba2..87c56ba8080c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1541,6 +1541,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) /* Error or no more MCE record */ if (rc <= 0) { mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; return rc; } rc = -EFAULT; -- cgit v1.2.3 From 862ae3132dc393ab6ea750b9ee9e0e1c276b9abb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 22 Feb 2012 20:37:10 -0800 Subject: x32: Drop non-__vdso weak symbols from the x32 VDSO Drop the legacy weak symbols that don't carry the __vdso prefix from the x32 VDSO. This is a new ABI and we don't need to support that legacy; the actual libc will export the proper symbols. Suggested-by: Andy Lutomirski Link: http://lkml.kernel.org/r/4F42E171.9080005@mit.edu Cc: H. J. Lu Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/vdsox32.lds.S | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S index 373ca9a02a53..62272aa2ae0a 100644 --- a/arch/x86/vdso/vdsox32.lds.S +++ b/arch/x86/vdso/vdsox32.lds.S @@ -17,13 +17,9 @@ VERSION { LINUX_2.6 { global: - clock_gettime; __vdso_clock_gettime; - gettimeofday; __vdso_gettimeofday; - getcpu; __vdso_getcpu; - time; __vdso_time; local: *; }; -- cgit v1.2.3 From 1cc1c96c1658bfaf85d06d764bd7ac00640ae90f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 20 Feb 2012 17:23:47 -0800 Subject: PCI: fix memleak when ACPI _CRS is not used. warning: unreferenced object 0xffff8801f6914200 (size 512): comm "swapper/0", pid 1, jiffies 4294893643 (age 2664.644s) hex dump (first 32 bytes): 00 00 c0 fe 00 00 00 00 ff ff ff ff 00 00 00 00 ................ 60 58 2f f6 03 88 ff ff 00 02 00 00 00 00 00 00 `X/............. backtrace: [] kmemleak_alloc+0x26/0x43 [] __kmalloc+0x121/0x183 [] get_current_resources+0x5a/0xc6 [] pci_acpi_scan_root+0x13c/0x21c [] acpi_pci_root_add+0x1e1/0x421 [] acpi_device_probe+0x50/0x190 [] really_probe+0x99/0x126 [] driver_probe_device+0x3b/0x56 [] __driver_attach+0x5f/0x82 [] bus_for_each_dev+0x5c/0x88 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xca/0x21d [] driver_register+0x91/0xfe [] acpi_bus_register_driver+0x43/0x45 [] acpi_pci_root_init+0x20/0x28 [] do_one_initcall+0x57/0x134 The system has _CRS for root buses, but they are not used because the machine date is before the cutoff date for _CRS usage. Try to free those unused resource arrays and names. 
Reviewed-by: Bjorn Helgaas Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index a312e76063a7..c33e0970ee9f 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -282,9 +282,6 @@ static void add_resources(struct pci_root_info *info) int i; struct resource *res, *root, *conflict; - if (!pci_use_crs) - return; - coalesce_windows(info, IORESOURCE_MEM); coalesce_windows(info, IORESOURCE_IO); @@ -336,8 +333,13 @@ get_current_resources(struct acpi_device *device, int busnum, acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); - add_resources(&info); - return; + if (pci_use_crs) { + add_resources(&info); + + return; + } + + kfree(info.name); name_alloc_fail: kfree(info.res); -- cgit v1.2.3 From 990a30c50c2bb3c4570aec7c33bedb969d089b7b Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 13 Feb 2012 12:59:00 +0000 Subject: x86/mrst/pci: assign d3_delay to 0 for Langwell devices Langwell devices are not true pci devices, they are not subject to the 10 ms d3 to d0 delay required by pci spec. This patch assigns d3_delay to 0 for all langwell pci devices. We can also power off devices that are not really used by the OS Signed-off-by: Jacob Pan Signed-off-by: Alan Cox Signed-off-by: Jesse Barnes --- arch/x86/pci/mrst.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index cb29191cee58..89e55485c787 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -239,6 +239,30 @@ int __init pci_mrst_init(void) return 1; } +/* Langwell devices are not true pci devices, they are not subject to 10 ms + * d3 to d0 delay required by pci spec. 
+ */ +static void __devinit pci_d3delay_fixup(struct pci_dev *dev) +{ + /* true pci devices in lincroft should allow type 1 access, the rest + * are langwell fake pci devices. + */ + if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) + return; + dev->d3_delay = 0; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); + +static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev) +{ + pci_set_power_state(dev, PCI_D3cold); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0812, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev); + /* * Langwell devices reside at fixed offsets, don't try to move them. */ -- cgit v1.2.3 From 8ed3087280ee8c527b7090887e333761a9c75474 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 13 Feb 2012 12:59:20 +0000 Subject: x86/mrst/pci: v4l/atomisp: treat atomisp as real pci device ATOMISP on Medfield is a real PCI device which should be handled differently than the fake PCI devices on south complex. PCI type 1 access is used for accessing config space this also has other impact such as PM D3 delay. There shouldn't be any need for reading base address from IUNIT via msg bus. 
Signed-off-by: Jacob Pan Signed-off-by: Artem Bityutskiy Signed-off-by: Jesse Barnes --- arch/x86/pci/mrst.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 89e55485c787..c5e81a4d7c1e 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -148,7 +148,9 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) */ if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) return 0; - if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0))) + if (bus == 0 && (devfn == PCI_DEVFN(2, 0) + || devfn == PCI_DEVFN(0, 0) + || devfn == PCI_DEVFN(3, 0))) return 1; return 0; /* langwell on others */ } -- cgit v1.2.3 From 823806ff6bd63f92644a5330cf0c3b68fac25ffd Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 13 Feb 2012 12:59:37 +0000 Subject: x86/mrst/pci: avoid SoC fixups on non-SoC platforms The PCI fixups get executed based upon whether they are linked in. We need to avoid executing them if we boot a dual SoC/PC type kernel on a PC class system. 
Signed-off-by: Alan Cox Signed-off-by: Jesse Barnes --- arch/x86/pci/mrst.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index c5e81a4d7c1e..140942f66b31 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -43,6 +43,8 @@ #define PCI_FIXED_BAR_4_SIZE 0x14 #define PCI_FIXED_BAR_5_SIZE 0x1c +static int pci_soc_mode = 0; + /** * fixed_bar_cap - return the offset of the fixed BAR cap if found * @bus: PCI bus @@ -233,10 +235,11 @@ struct pci_ops pci_mrst_ops = { */ int __init pci_mrst_init(void) { - printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n"); + printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); pcibios_enable_irq = mrst_pci_irq_enable; pci_root_ops = pci_mrst_ops; + pci_soc_mode = 1; /* Continue with standard init */ return 1; } @@ -246,6 +249,10 @@ int __init pci_mrst_init(void) */ static void __devinit pci_d3delay_fixup(struct pci_dev *dev) { + /* PCI fixups are effectively decided compile time. If we have a dual + SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ + if (!pci_soc_mode) + return; /* true pci devices in lincroft should allow type 1 access, the rest * are langwell fake pci devices. */ @@ -274,6 +281,9 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) u32 size; int i; + if (!pci_soc_mode) + return; + /* Must have extended configuration space */ if (dev->cfg_size < PCIE_CAP_OFFSET + 4) return; -- cgit v1.2.3 From b4e518547da042fdc65bd4bdafd046fed13337d5 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 16 Dec 2011 15:50:17 -0700 Subject: irq_domain/x86: Convert x86 (embedded) to use common irq_domain This patch removes the x86-specific definition of irq_domain and replaces it with the common implementation. 
Signed-off-by: Grant Likely Acked-by: Sebastian Andrzej Siewior Cc: Rob Herring Cc: Thomas Gleixner --- arch/x86/Kconfig | 2 + arch/x86/include/asm/irq_controller.h | 12 ---- arch/x86/include/asm/prom.h | 10 ---- arch/x86/kernel/devicetree.c | 101 ++++++++++------------------------ drivers/net/phy/mdio-gpio.c | 4 +- 5 files changed, 34 insertions(+), 95 deletions(-) delete mode 100644 arch/x86/include/asm/irq_controller.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..e0829a6a4660 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -398,6 +398,7 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE + select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -2076,6 +2077,7 @@ config OLPC select GPIOLIB select OF select OF_PROMTREE + select IRQ_DOMAIN ---help--- Add support for detecting the unique features of the OLPC XO hardware. 
diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h deleted file mode 100644 index 423bbbddf36d..000000000000 --- a/arch/x86/include/asm/irq_controller.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __IRQ_CONTROLLER__ -#define __IRQ_CONTROLLER__ - -struct irq_domain { - int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize, - u32 *out_hwirq, u32 *out_type); - void *priv; - struct device_node *controller; - struct list_head l; -}; - -#endif diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 644dd885f05a..60bef663609a 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -21,7 +21,6 @@ #include #include #include -#include #ifdef CONFIG_OF extern int of_ioapic; @@ -43,15 +42,6 @@ extern char cmd_line[COMMAND_LINE_SIZE]; #define pci_address_to_pio pci_address_to_pio unsigned long pci_address_to_pio(phys_addr_t addr); -/** - * irq_dispose_mapping - Unmap an interrupt - * @virq: linux virq number of the interrupt to unmap - * - * FIXME: We really should implement proper virq handling like power, - * but that's going to be major surgery. 
- */ -static inline void irq_dispose_mapping(unsigned int virq) { } - #define HAVE_ARCH_DEVTREE_FIXUPS #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 52821799a702..3ae2ced4a874 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -17,64 +18,14 @@ #include #include -#include #include #include __initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; -static LIST_HEAD(irq_domains); -static DEFINE_RAW_SPINLOCK(big_irq_lock); int __initdata of_ioapic; -#ifdef CONFIG_X86_IO_APIC -static void add_interrupt_host(struct irq_domain *ih) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_add(&ih->l, &irq_domains); - raw_spin_unlock_irqrestore(&big_irq_lock, flags); -} -#endif - -static struct irq_domain *get_ih_from_node(struct device_node *controller) -{ - struct irq_domain *ih, *found = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_for_each_entry(ih, &irq_domains, l) { - if (ih->controller == controller) { - found = ih; - break; - } - } - raw_spin_unlock_irqrestore(&big_irq_lock, flags); - return found; -} - -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *ih; - u32 virq, type; - int ret; - - ih = get_ih_from_node(controller); - if (!ih) - return 0; - ret = ih->xlate(ih, intspec, intsize, &virq, &type); - if (ret) - return 0; - if (type == IRQ_TYPE_NONE) - return virq; - irq_set_irq_type(virq, type); - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - unsigned long pci_address_to_pio(phys_addr_t address) { /* @@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, - u32 *out_hwirq, u32 *out_type) +static int ioapic_xlate(struct 
irq_domain *domain, + struct device_node *controller, + const u32 *intspec, u32 intsize, + irq_hw_number_t *out_hwirq, u32 *out_type) { - struct mp_ioapic_gsi *gsi_cfg; struct io_apic_irq_attr attr; struct of_ioapic_type *it; - u32 line, idx, type; + u32 line, idx; + int rc; - if (intsize < 2) + if (WARN_ON(intsize < 2)) return -EINVAL; - line = *intspec; - idx = (u32) id->priv; - gsi_cfg = mp_ioapic_gsi_routing(idx); - *out_hwirq = line + gsi_cfg->gsi_base; - - intspec++; - type = *intspec; + line = intspec[0]; - if (type >= ARRAY_SIZE(of_ioapic_type)) + if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = of_ioapic_type + type; - *out_type = it->out_type; + it = &of_ioapic_type[intspec[1]]; + idx = (u32) domain->host_data; set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); + rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), + cpu_to_node(0), &attr); + if (rc) + return rc; + + *out_hwirq = line; + *out_type = it->out_type; + return 0; } +const struct irq_domain_ops ioapic_irq_domain_ops = { + .xlate = ioapic_xlate, +}; + static void __init ioapic_add_ofnode(struct device_node *np) { struct resource r; @@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np) for (i = 0; i < nr_ioapics; i++) { if (r.start == mpc_ioapic_addr(i)) { struct irq_domain *id; + struct mp_ioapic_gsi *gsi_cfg; + + gsi_cfg = mp_ioapic_gsi_routing(i); - id = kzalloc(sizeof(*id), GFP_KERNEL); + id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, + &ioapic_irq_domain_ops, + (void*)i); BUG_ON(!id); - id->controller = np; - id->xlate = ioapic_xlate; - id->priv = (void *)i; - add_interrupt_host(id); return; } } diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 50e8e5e74465..7189adf54bd1 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -255,13 +255,13 @@ static inline int __init 
mdio_ofgpio_init(void) return platform_driver_register(&mdio_ofgpio_driver); } -static inline void __exit mdio_ofgpio_exit(void) +static inline void mdio_ofgpio_exit(void) { platform_driver_unregister(&mdio_ofgpio_driver); } #else static inline int __init mdio_ofgpio_init(void) { return 0; } -static inline void __exit mdio_ofgpio_exit(void) { } +static inline void mdio_ofgpio_exit(void) { } #endif /* CONFIG_OF_GPIO */ static struct platform_driver mdio_gpio_driver = { -- cgit v1.2.3 From 83e7ee6657dfcd6b0ee2406d11024b558064252a Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 12 Feb 2012 13:24:25 -0800 Subject: x86, efi: Refactor efi_init() a bit Break out some of the init steps into helper functions. Only change to execution flow is the removal of the warning when the kernel memdesc structure differs in size from what firmware specifies since it's a bogus warning (it's a valid difference per spec). v4: * Removed memdesc warning as per above Signed-off-by: Olof Johansson Link: http://lkml.kernel.org/r/1329081869-20779-2-git-send-email-olof@lixom.net Acked-by: Matt Fleming Signed-off-by: H. 
Peter Anvin --- arch/x86/platform/efi/efi.c | 89 +++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 4cf9bd0a1653..6d88dcac466c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -429,23 +429,8 @@ static void __init efi_free_boot_services(void) } } -void __init efi_init(void) +static void __init efi_systab_init(void *phys) { - efi_config_table_t *config_tables; - efi_runtime_services_t *runtime; - efi_char16_t *c16; - char vendor[100] = "unknown"; - int i = 0; - void *tmp; - -#ifdef CONFIG_X86_32 - efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; -#else - efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); -#endif - efi.systab = early_ioremap((unsigned long)efi_phys.systab, sizeof(efi_system_table_t)); if (efi.systab == NULL) @@ -464,22 +449,12 @@ void __init efi_init(void) "%d.%02d, expected 1.00 or greater!\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); +} - /* - * Show what we know for posterity - */ - c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); - if (c16) { - for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = *c16++; - vendor[i] = '\0'; - } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); - early_iounmap(tmp, 2); - - printk(KERN_INFO "EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); +static void __init efi_config_init(u64 tables, int nr_tables) +{ + efi_config_table_t *config_tables; + int i; /* * Let's see what config tables the firmware passed to us. 
@@ -526,6 +501,11 @@ void __init efi_init(void) printk("\n"); early_iounmap(config_tables, efi.systab->nr_tables * sizeof(efi_config_table_t)); +} + +static void __init efi_runtime_init(void) +{ + efi_runtime_services_t *runtime; /* * Check out the runtime services table. We need to map @@ -554,7 +534,10 @@ void __init efi_init(void) printk(KERN_ERR "Could not map the EFI runtime service " "table!\n"); early_iounmap(runtime, sizeof(efi_runtime_services_t)); +} +static void __init efi_memmap_init(void) +{ /* Map the EFI memory map */ memmap.map = early_ioremap((unsigned long)memmap.phys_map, memmap.nr_map * memmap.desc_size); @@ -562,12 +545,48 @@ void __init efi_init(void) printk(KERN_ERR "Could not map the EFI memory map!\n"); memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING - "Kernel-defined memdesc doesn't match the one from EFI!\n"); - if (add_efi_memmap) do_add_efi_memmap(); +} + +void __init efi_init(void) +{ + efi_char16_t *c16; + char vendor[100] = "unknown"; + int i = 0; + void *tmp; + +#ifdef CONFIG_X86_32 + efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; +#else + efi_phys.systab = (efi_system_table_t *) + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); +#endif + + efi_systab_init(efi_phys.systab); + + /* + * Show what we know for posterity + */ + c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); + if (c16) { + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } else + printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + early_iounmap(tmp, 2); + + printk(KERN_INFO "EFI v%u.%.02u by %s\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + efi_config_init(efi.systab->tables, efi.systab->nr_tables); + + efi_runtime_init(); + + efi_memmap_init(); #ifdef CONFIG_X86_32 x86_platform.get_wallclock = efi_get_time; -- cgit 
v1.2.3 From e3cb3f5a35997906f9b79bf860029c02a54cfae6 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 12 Feb 2012 13:24:26 -0800 Subject: x86, efi: Convert printk to pr_*() Alright, I guess I'll go through and convert them, even though there's no net gain to speak of. v4: * Switched to pr_fmt and removed some redundant use of "EFI" in messages. Signed-off-by: Olof Johansson Link: http://lkml.kernel.org/r/1329081869-20779-3-git-send-email-olof@lixom.net Cc: Joe Perches Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 58 ++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 6d88dcac466c..511fb15e2036 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -26,6 +26,8 @@ * Skip non-WB memory and ignore empty memory ranges. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -47,7 +49,6 @@ #include #define EFI_DEBUG 1 -#define PFX "EFI: " int efi_enabled; EXPORT_SYMBOL(efi_enabled); @@ -254,7 +255,7 @@ int efi_set_rtc_mmss(unsigned long nowtime) status = efi.get_time(&eft, &cap); if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't read time!\n"); + pr_err("Oops: efitime: can't read time!\n"); return -1; } @@ -268,7 +269,7 @@ int efi_set_rtc_mmss(unsigned long nowtime) status = efi.set_time(&eft); if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't write time!\n"); + pr_err("Oops: efitime: can't write time!\n"); return -1; } return 0; @@ -282,7 +283,7 @@ unsigned long efi_get_time(void) status = efi.get_time(&eft, &cap); if (status != EFI_SUCCESS) - printk(KERN_ERR "Oops: efitime: can't read time!\n"); + pr_err("Oops: efitime: can't read time!\n"); return mktime(eft.year, eft.month, eft.day, eft.hour, eft.minute, eft.second); @@ -367,7 +368,7 @@ static void __init print_efi_memmap(void) p < memmap.map_end; p += 
memmap.desc_size, i++) { md = p; - printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " + pr_info("mem%02u: type=%u, attr=0x%llx, " "range=[0x%016llx-0x%016llx) (%lluMB)\n", i, md->type, md->attribute, md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), @@ -400,7 +401,7 @@ void __init efi_reserve_boot_services(void) memblock_is_region_reserved(start, size)) { /* Could not reserve, skip it */ md->num_pages = 0; - memblock_dbg(PFX "Could not reserve boot range " + memblock_dbg("Could not reserve boot range " "[0x%010llx-0x%010llx]\n", start, start+size-1); } else @@ -434,7 +435,7 @@ static void __init efi_systab_init(void *phys) efi.systab = early_ioremap((unsigned long)efi_phys.systab, sizeof(efi_system_table_t)); if (efi.systab == NULL) - printk(KERN_ERR "Couldn't map the EFI system table!\n"); + pr_err("Couldn't map the system table!\n"); memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); early_iounmap(efi.systab, sizeof(efi_system_table_t)); efi.systab = &efi_systab; @@ -443,9 +444,9 @@ static void __init efi_systab_init(void *phys) * Verify the EFI Table */ if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - printk(KERN_ERR "EFI system table signature incorrect!\n"); + pr_err("System table signature incorrect!\n"); if ((efi.systab->hdr.revision >> 16) == 0) - printk(KERN_ERR "Warning: EFI system table version " + pr_err("Warning: System table version " "%d.%02d, expected 1.00 or greater!\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); @@ -463,42 +464,42 @@ static void __init efi_config_init(u64 tables, int nr_tables) efi.systab->tables, efi.systab->nr_tables * sizeof(efi_config_table_t)); if (config_tables == NULL) - printk(KERN_ERR "Could not map EFI Configuration Table!\n"); + pr_err("Could not map Configuration table!\n"); - printk(KERN_INFO); + pr_info(""); for (i = 0; i < efi.systab->nr_tables; i++) { if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { efi.mps = config_tables[i].table; - 
printk(" MPS=0x%lx ", config_tables[i].table); + pr_cont(" MPS=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID)) { efi.acpi20 = config_tables[i].table; - printk(" ACPI 2.0=0x%lx ", config_tables[i].table); + pr_cont(" ACPI 2.0=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID)) { efi.acpi = config_tables[i].table; - printk(" ACPI=0x%lx ", config_tables[i].table); + pr_cont(" ACPI=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID)) { efi.smbios = config_tables[i].table; - printk(" SMBIOS=0x%lx ", config_tables[i].table); + pr_cont(" SMBIOS=0x%lx ", config_tables[i].table); #ifdef CONFIG_X86_UV } else if (!efi_guidcmp(config_tables[i].guid, UV_SYSTEM_TABLE_GUID)) { efi.uv_systab = config_tables[i].table; - printk(" UVsystab=0x%lx ", config_tables[i].table); + pr_cont(" UVsystab=0x%lx ", config_tables[i].table); #endif } else if (!efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID)) { efi.hcdp = config_tables[i].table; - printk(" HCDP=0x%lx ", config_tables[i].table); + pr_cont(" HCDP=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID)) { efi.uga = config_tables[i].table; - printk(" UGA=0x%lx ", config_tables[i].table); + pr_cont(" UGA=0x%lx ", config_tables[i].table); } } - printk("\n"); + pr_cont("\n"); early_iounmap(config_tables, efi.systab->nr_tables * sizeof(efi_config_table_t)); } @@ -531,8 +532,7 @@ static void __init efi_runtime_init(void) */ efi.get_time = phys_efi_get_time; } else - printk(KERN_ERR "Could not map the EFI runtime service " - "table!\n"); + pr_err("Could not map the runtime service table!\n"); early_iounmap(runtime, sizeof(efi_runtime_services_t)); } @@ -542,7 +542,7 @@ static void __init efi_memmap_init(void) memmap.map = early_ioremap((unsigned long)memmap.phys_map, memmap.nr_map * memmap.desc_size); if (memmap.map == NULL) - printk(KERN_ERR 
"Could not map the EFI memory map!\n"); + pr_err("Could not map the memory map!\n"); memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); if (add_efi_memmap) @@ -575,12 +575,12 @@ void __init efi_init(void) vendor[i] = *c16++; vendor[i] = '\0'; } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + pr_err("Could not map the firmware vendor!\n"); early_iounmap(tmp, 2); - printk(KERN_INFO "EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); + pr_info("EFI v%u.%.02u by %s\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); efi_config_init(efi.systab->tables, efi.systab->nr_tables); @@ -696,7 +696,7 @@ void __init efi_enter_virtual_mode(void) md->virt_addr = (u64) (unsigned long) va; if (!va) { - printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", + pr_err("ioremap of 0x%llX failed!\n", (unsigned long long)md->phys_addr); continue; } @@ -730,8 +730,8 @@ void __init efi_enter_virtual_mode(void) (efi_memory_desc_t *)__pa(new_memmap)); if (status != EFI_SUCCESS) { - printk(KERN_ALERT "Unable to switch EFI into virtual mode " - "(status=%lx)!\n", status); + pr_alert("Unable to switch EFI into virtual mode " + "(status=%lx)!\n", status); panic("EFI call to SetVirtualAddressMap() failed!"); } -- cgit v1.2.3 From a6a46f415dca828a04a435ca1f67de0bc5b9ae30 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 12 Feb 2012 13:24:27 -0800 Subject: x86, efi: Cleanup config table walking Trivial cleanup, move guid and table pointers to local copies to make the code cleaner. Signed-off-by: Olof Johansson Link: http://lkml.kernel.org/r/1329081869-20779-4-git-send-email-olof@lixom.net Acked-by: Matt Fleming Signed-off-by: H. 
Peter Anvin --- arch/x86/platform/efi/efi.c | 61 +++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 511fb15e2036..03259d1df14f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -455,53 +455,48 @@ static void __init efi_systab_init(void *phys) static void __init efi_config_init(u64 tables, int nr_tables) { efi_config_table_t *config_tables; - int i; + int i, sz = sizeof(efi_config_table_t); /* * Let's see what config tables the firmware passed to us. */ - config_tables = early_ioremap( - efi.systab->tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); + config_tables = early_ioremap(efi.systab->tables, + efi.systab->nr_tables * sz); if (config_tables == NULL) pr_err("Could not map Configuration table!\n"); pr_info(""); for (i = 0; i < efi.systab->nr_tables; i++) { - if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { - efi.mps = config_tables[i].table; - pr_cont(" MPS=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_20_TABLE_GUID)) { - efi.acpi20 = config_tables[i].table; - pr_cont(" ACPI 2.0=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_TABLE_GUID)) { - efi.acpi = config_tables[i].table; - pr_cont(" ACPI=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - SMBIOS_TABLE_GUID)) { - efi.smbios = config_tables[i].table; - pr_cont(" SMBIOS=0x%lx ", config_tables[i].table); + efi_guid_t guid = config_tables[i].guid; + unsigned long table = config_tables[i].table; + + if (!efi_guidcmp(guid, MPS_TABLE_GUID)) { + efi.mps = table; + pr_cont(" MPS=0x%lx ", table); + } else if (!efi_guidcmp(guid, ACPI_20_TABLE_GUID)) { + efi.acpi20 = table; + pr_cont(" ACPI 2.0=0x%lx ", table); + } else if (!efi_guidcmp(guid, ACPI_TABLE_GUID)) { + efi.acpi = table; + pr_cont(" ACPI=0x%lx ", 
table); + } else if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) { + efi.smbios = table; + pr_cont(" SMBIOS=0x%lx ", table); #ifdef CONFIG_X86_UV - } else if (!efi_guidcmp(config_tables[i].guid, - UV_SYSTEM_TABLE_GUID)) { - efi.uv_systab = config_tables[i].table; - pr_cont(" UVsystab=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(guid, UV_SYSTEM_TABLE_GUID)) { + efi.uv_systab = table; + pr_cont(" UVsystab=0x%lx ", table); #endif - } else if (!efi_guidcmp(config_tables[i].guid, - HCDP_TABLE_GUID)) { - efi.hcdp = config_tables[i].table; - pr_cont(" HCDP=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - UGA_IO_PROTOCOL_GUID)) { - efi.uga = config_tables[i].table; - pr_cont(" UGA=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(guid, HCDP_TABLE_GUID)) { + efi.hcdp = table; + pr_cont(" HCDP=0x%lx ", table); + } else if (!efi_guidcmp(guid, UGA_IO_PROTOCOL_GUID)) { + efi.uga = table; + pr_cont(" UGA=0x%lx ", table); } } pr_cont("\n"); - early_iounmap(config_tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); + early_iounmap(config_tables, efi.systab->nr_tables * sz); } static void __init efi_runtime_init(void) -- cgit v1.2.3 From 140bf275d3e89e9b36851d5cf498dbbbecdf7ca8 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 12 Feb 2012 13:24:28 -0800 Subject: x86, efi: Add basic error handling It's not perfect, but way better than before. Mark efi_enabled as false in case of error and at least stop dereferencing pointers that are known to be invalid. The only significant missing piece is the lack of undoing the memblock_reserve of the memory that efi marks as in use. On the other hand, it's not a large amount of memory, and leaving it unavailable for system use should be the safer choice anyway. Signed-off-by: Olof Johansson Link: http://lkml.kernel.org/r/1329081869-20779-5-git-send-email-olof@lixom.net Acked-by: Matt Fleming Signed-off-by: H. 
Peter Anvin --- arch/x86/platform/efi/efi.c | 85 ++++++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 03259d1df14f..5a053e7737b7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -430,12 +430,14 @@ static void __init efi_free_boot_services(void) } } -static void __init efi_systab_init(void *phys) +static int __init efi_systab_init(void *phys) { efi.systab = early_ioremap((unsigned long)efi_phys.systab, sizeof(efi_system_table_t)); - if (efi.systab == NULL) + if (efi.systab == NULL) { pr_err("Couldn't map the system table!\n"); + return -ENOMEM; + } memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); early_iounmap(efi.systab, sizeof(efi_system_table_t)); efi.systab = &efi_systab; @@ -443,16 +445,20 @@ static void __init efi_systab_init(void *phys) /* * Verify the EFI Table */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { pr_err("System table signature incorrect!\n"); + return -EINVAL; + } if ((efi.systab->hdr.revision >> 16) == 0) pr_err("Warning: System table version " "%d.%02d, expected 1.00 or greater!\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); + + return 0; } -static void __init efi_config_init(u64 tables, int nr_tables) +static int __init efi_config_init(u64 tables, int nr_tables) { efi_config_table_t *config_tables; int i, sz = sizeof(efi_config_table_t); @@ -462,8 +468,10 @@ static void __init efi_config_init(u64 tables, int nr_tables) */ config_tables = early_ioremap(efi.systab->tables, efi.systab->nr_tables * sz); - if (config_tables == NULL) + if (config_tables == NULL) { pr_err("Could not map Configuration table!\n"); + return -ENOMEM; + } pr_info(""); for (i = 0; i < efi.systab->nr_tables; i++) { @@ -497,9 +505,11 @@ static void __init efi_config_init(u64 tables, int 
nr_tables) } pr_cont("\n"); early_iounmap(config_tables, efi.systab->nr_tables * sz); + + return 0; } -static void __init efi_runtime_init(void) +static int __init efi_runtime_init(void) { efi_runtime_services_t *runtime; @@ -511,37 +521,44 @@ static void __init efi_runtime_init(void) */ runtime = early_ioremap((unsigned long)efi.systab->runtime, sizeof(efi_runtime_services_t)); - if (runtime != NULL) { - /* - * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map - * is invoked. - */ - efi_phys.get_time = (efi_get_time_t *)runtime->get_time; - efi_phys.set_virtual_address_map = - (efi_set_virtual_address_map_t *) - runtime->set_virtual_address_map; - /* - * Make efi_get_time can be called before entering - * virtual mode. - */ - efi.get_time = phys_efi_get_time; - } else + if (!runtime) { pr_err("Could not map the runtime service table!\n"); + return -ENOMEM; + } + /* + * We will only need *early* access to the following + * two EFI runtime services before set_virtual_address_map + * is invoked. + */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; + efi_phys.set_virtual_address_map = + (efi_set_virtual_address_map_t *) + runtime->set_virtual_address_map; + /* + * Make efi_get_time can be called before entering + * virtual mode. 
+ */ + efi.get_time = phys_efi_get_time; early_iounmap(runtime, sizeof(efi_runtime_services_t)); + + return 0; } -static void __init efi_memmap_init(void) +static int __init efi_memmap_init(void) { /* Map the EFI memory map */ memmap.map = early_ioremap((unsigned long)memmap.phys_map, memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) + if (memmap.map == NULL) { pr_err("Could not map the memory map!\n"); + return -ENOMEM; + } memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); if (add_efi_memmap) do_add_efi_memmap(); + + return 0; } void __init efi_init(void) @@ -559,7 +576,10 @@ void __init efi_init(void) ((__u64)boot_params.efi_info.efi_systab_hi<<32)); #endif - efi_systab_init(efi_phys.systab); + if (efi_systab_init(efi_phys.systab)) { + efi_enabled = 0; + return; + } /* * Show what we know for posterity @@ -577,11 +597,20 @@ void __init efi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - efi_config_init(efi.systab->tables, efi.systab->nr_tables); + if (efi_config_init(efi.systab->tables, efi.systab->nr_tables)) { + efi_enabled = 0; + return; + } - efi_runtime_init(); + if (efi_runtime_init()) { + efi_enabled = 0; + return; + } - efi_memmap_init(); + if (efi_memmap_init()) { + efi_enabled = 0; + return; + } #ifdef CONFIG_X86_32 x86_platform.get_wallclock = efi_get_time; -- cgit v1.2.3 From 1adbfa3511ee1c1118e16a9a0246870f12fef4e6 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 12 Feb 2012 13:24:29 -0800 Subject: x86, efi: Allow basic init with mixed 32/64-bit efi/kernel Traditionally the kernel has refused to setup EFI at all if there's been a mismatch in 32/64-bit mode between EFI and the kernel. On some platforms that boot natively through EFI (Chrome OS being one), we still need to get at least some of the static data such as memory configuration out of EFI. 
Runtime services aren't as critical, and it's a significant amount of work to implement switching between the operating modes to call between kernel and firmware for these cases. So I'm ignoring it for now. v5: * Fixed some printk strings based on feedback * Renamed 32/64-bit specific types to not have _ prefix * Fixed bug in printout of efi runtime disablement v4: * Some of the earlier cleanup was accidentally reverted by this patch, fixed. * Reworded some messages to not have to line wrap printk strings v3: * Reorganized to a series of patches to make it easier to review, and do some of the cleanups I had left out before. v2: * Added graceful error handling for 32-bit kernel that gets passed EFI data above 4GB. * Removed some warnings that were missed in first version. Signed-off-by: Olof Johansson Link: http://lkml.kernel.org/r/1329081869-20779-6-git-send-email-olof@lixom.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 2 +- arch/x86/kernel/setup.c | 10 ++- arch/x86/platform/efi/efi.c | 164 ++++++++++++++++++++++++++++++++++++++------ include/linux/efi.h | 45 ++++++++++++ 4 files changed, 196 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 844f735fd63a..c9dcc181d4d1 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -95,7 +95,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, extern int add_efi_memmap; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); -extern void efi_memblock_x86_reserve_range(void); +extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d7d5099fe874..88638883176a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if 
(!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI_LOADER_SIGNATURE, 4)) { + "EL32", 4)) { efi_enabled = 1; - efi_memblock_x86_reserve_range(); + efi_64bit = false; + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) { + efi_enabled = 1; + efi_64bit = true; } + if (efi_enabled && efi_memblock_x86_reserve_range()) + efi_enabled = 0; #endif x86_init.oem.arch_setup(); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 5a053e7737b7..92660edaa1e7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -68,6 +68,9 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; +bool efi_64bit; +static bool efi_native; + static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -339,11 +342,16 @@ static void __init do_add_efi_memmap(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } -void __init efi_memblock_x86_reserve_range(void) +int __init efi_memblock_x86_reserve_range(void) { unsigned long pmap; #ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (boot_params.efi_info.efi_memmap_hi) { + pr_err("Memory map is above 4GB, disabling EFI.\n"); + return -EINVAL; + } pmap = boot_params.efi_info.efi_memmap; #else pmap = (boot_params.efi_info.efi_memmap | @@ -355,6 +363,8 @@ void __init efi_memblock_x86_reserve_range(void) memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + + return 0; } #if EFI_DEBUG @@ -432,14 +442,75 @@ static void __init efi_free_boot_services(void) static int __init efi_systab_init(void *phys) { - efi.systab = early_ioremap((unsigned long)efi_phys.systab, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) { - pr_err("Couldn't map the system table!\n"); - return -ENOMEM; + if (efi_64bit) { + efi_system_table_64_t *systab64; + u64 tmp = 0; + + systab64 = 
early_ioremap((unsigned long)phys, + sizeof(*systab64)); + if (systab64 == NULL) { + pr_err("Couldn't map the system table!\n"); + return -ENOMEM; + } + + efi_systab.hdr = systab64->hdr; + efi_systab.fw_vendor = systab64->fw_vendor; + tmp |= systab64->fw_vendor; + efi_systab.fw_revision = systab64->fw_revision; + efi_systab.con_in_handle = systab64->con_in_handle; + tmp |= systab64->con_in_handle; + efi_systab.con_in = systab64->con_in; + tmp |= systab64->con_in; + efi_systab.con_out_handle = systab64->con_out_handle; + tmp |= systab64->con_out_handle; + efi_systab.con_out = systab64->con_out; + tmp |= systab64->con_out; + efi_systab.stderr_handle = systab64->stderr_handle; + tmp |= systab64->stderr_handle; + efi_systab.stderr = systab64->stderr; + tmp |= systab64->stderr; + efi_systab.runtime = (void *)(unsigned long)systab64->runtime; + tmp |= systab64->runtime; + efi_systab.boottime = (void *)(unsigned long)systab64->boottime; + tmp |= systab64->boottime; + efi_systab.nr_tables = systab64->nr_tables; + efi_systab.tables = systab64->tables; + tmp |= systab64->tables; + + early_iounmap(systab64, sizeof(*systab64)); +#ifdef CONFIG_X86_32 + if (tmp >> 32) { + pr_err("EFI data located above 4GB, disabling EFI.\n"); + return -EINVAL; + } +#endif + } else { + efi_system_table_32_t *systab32; + + systab32 = early_ioremap((unsigned long)phys, + sizeof(*systab32)); + if (systab32 == NULL) { + pr_err("Couldn't map the system table!\n"); + return -ENOMEM; + } + + efi_systab.hdr = systab32->hdr; + efi_systab.fw_vendor = systab32->fw_vendor; + efi_systab.fw_revision = systab32->fw_revision; + efi_systab.con_in_handle = systab32->con_in_handle; + efi_systab.con_in = systab32->con_in; + efi_systab.con_out_handle = systab32->con_out_handle; + efi_systab.con_out = systab32->con_out; + efi_systab.stderr_handle = systab32->stderr_handle; + efi_systab.stderr = systab32->stderr; + efi_systab.runtime = (void *)(unsigned long)systab32->runtime; + efi_systab.boottime = (void *)(unsigned 
long)systab32->boottime; + efi_systab.nr_tables = systab32->nr_tables; + efi_systab.tables = systab32->tables; + + early_iounmap(systab32, sizeof(*systab32)); } - memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); - early_iounmap(efi.systab, sizeof(efi_system_table_t)); + efi.systab = &efi_systab; /* @@ -460,24 +531,47 @@ static int __init efi_systab_init(void *phys) static int __init efi_config_init(u64 tables, int nr_tables) { - efi_config_table_t *config_tables; - int i, sz = sizeof(efi_config_table_t); + void *config_tables, *tablep; + int i, sz; + + if (efi_64bit) + sz = sizeof(efi_config_table_64_t); + else + sz = sizeof(efi_config_table_32_t); /* * Let's see what config tables the firmware passed to us. */ - config_tables = early_ioremap(efi.systab->tables, - efi.systab->nr_tables * sz); + config_tables = early_ioremap(tables, nr_tables * sz); if (config_tables == NULL) { pr_err("Could not map Configuration table!\n"); return -ENOMEM; } + tablep = config_tables; pr_info(""); for (i = 0; i < efi.systab->nr_tables; i++) { - efi_guid_t guid = config_tables[i].guid; - unsigned long table = config_tables[i].table; - + efi_guid_t guid; + unsigned long table; + + if (efi_64bit) { + u64 table64; + guid = ((efi_config_table_64_t *)tablep)->guid; + table64 = ((efi_config_table_64_t *)tablep)->table; + table = table64; +#ifdef CONFIG_X86_32 + if (table64 >> 32) { + pr_cont("\n"); + pr_err("Table located above 4GB, disabling EFI.\n"); + early_iounmap(config_tables, + efi.systab->nr_tables * sz); + return -EINVAL; + } +#endif + } else { + guid = ((efi_config_table_32_t *)tablep)->guid; + table = ((efi_config_table_32_t *)tablep)->table; + } if (!efi_guidcmp(guid, MPS_TABLE_GUID)) { efi.mps = table; pr_cont(" MPS=0x%lx ", table); @@ -502,10 +596,10 @@ static int __init efi_config_init(u64 tables, int nr_tables) efi.uga = table; pr_cont(" UGA=0x%lx ", table); } + tablep += sz; } pr_cont("\n"); early_iounmap(config_tables, efi.systab->nr_tables * sz); - return 0; 
} @@ -569,11 +663,19 @@ void __init efi_init(void) void *tmp; #ifdef CONFIG_X86_32 + if (boot_params.efi_info.efi_systab_hi || + boot_params.efi_info.efi_memmap_hi) { + pr_info("Table located above 4GB, disabling EFI.\n"); + efi_enabled = 0; + return; + } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; + efi_native = !efi_64bit; #else efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + efi_native = efi_64bit; #endif if (efi_systab_init(efi_phys.systab)) { @@ -602,7 +704,14 @@ void __init efi_init(void) return; } - if (efi_runtime_init()) { + /* + * Note: We currently don't support runtime services on an EFI + * that doesn't match the kernel 32/64-bit mode. + */ + + if (!efi_native) + pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); + else if (efi_runtime_init()) { efi_enabled = 0; return; } @@ -611,10 +720,11 @@ void __init efi_init(void) efi_enabled = 0; return; } - #ifdef CONFIG_X86_32 - x86_platform.get_wallclock = efi_get_time; - x86_platform.set_wallclock = efi_set_rtc_mmss; + if (efi_native) { + x86_platform.get_wallclock = efi_get_time; + x86_platform.set_wallclock = efi_set_rtc_mmss; + } #endif #if EFI_DEBUG @@ -672,6 +782,14 @@ void __init efi_enter_virtual_mode(void) efi.systab = NULL; + /* + * We don't do virtual mode, since we don't do runtime services, on + * non-native EFI + */ + + if (!efi_native) + goto out; + /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { u64 prev_size; @@ -787,6 +905,8 @@ void __init efi_enter_virtual_mode(void) efi.query_capsule_caps = virt_efi_query_capsule_caps; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); + +out: early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; kfree(new_memmap); diff 
--git a/include/linux/efi.h b/include/linux/efi.h index 37c300712e02..47fbf6b3dc77 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -313,6 +313,16 @@ typedef efi_status_t efi_query_capsule_caps_t(efi_capsule_header_t **capsules, #define EFI_FILE_SYSTEM_GUID \ EFI_GUID( 0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b ) +typedef struct { + efi_guid_t guid; + u64 table; +} efi_config_table_64_t; + +typedef struct { + efi_guid_t guid; + u32 table; +} efi_config_table_32_t; + typedef struct { efi_guid_t guid; unsigned long table; @@ -327,6 +337,40 @@ typedef struct { #define EFI_1_10_SYSTEM_TABLE_REVISION ((1 << 16) | (10)) #define EFI_1_02_SYSTEM_TABLE_REVISION ((1 << 16) | (02)) +typedef struct { + efi_table_hdr_t hdr; + u64 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 __pad1; + u64 con_in_handle; + u64 con_in; + u64 con_out_handle; + u64 con_out; + u64 stderr_handle; + u64 stderr; + u64 runtime; + u64 boottime; + u32 nr_tables; + u32 __pad2; + u64 tables; +} efi_system_table_64_t; + +typedef struct { + efi_table_hdr_t hdr; + u32 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 con_in_handle; + u32 con_in; + u32 con_out_handle; + u32 con_out; + u32 stderr_handle; + u32 stderr; + u32 runtime; + u32 boottime; + u32 nr_tables; + u32 tables; +} efi_system_table_32_t; + typedef struct { efi_table_hdr_t hdr; unsigned long fw_vendor; /* physical addr of CHAR16 vendor string */ @@ -497,6 +541,7 @@ extern int __init efi_setup_pcdp_console(char *); #ifdef CONFIG_EFI # ifdef CONFIG_X86 extern int efi_enabled; + extern bool efi_64bit; # else # define efi_enabled 1 # endif -- cgit v1.2.3 From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Feb 2012 08:31:31 +0100 Subject: static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]() So here's a boot tested patch on top of Jason's series 
that does all the cleanups I talked about and turns jump labels into a more intuitive to use facility. It should also address the various misconceptions and confusions that surround jump labels. Typical usage scenarios: #include struct static_key key = STATIC_KEY_INIT_TRUE; if (static_key_false(&key)) do unlikely code else do likely code Or: if (static_key_true(&key)) do likely code else do unlikely code The static key is modified via: static_key_slow_inc(&key); ... static_key_slow_dec(&key); The 'slow' prefix makes it abundantly clear that this is an expensive operation. I've updated all in-kernel code to use this everywhere. Note that I (intentionally) have not pushed through the rename blindly through to the lowest levels: the actual jump-label patching arch facility should be named like that, so we want to decouple jump labels from the static-key facility a bit. On non-jump-label enabled architectures static keys default to likely()/unlikely() branches. Signed-off-by: Ingo Molnar Acked-by: Jason Baron Acked-by: Steven Rostedt Cc: a.p.zijlstra@chello.nl Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu Signed-off-by: Ingo Molnar --- arch/Kconfig | 29 ++++--- arch/ia64/include/asm/paravirt.h | 6 +- arch/ia64/kernel/paravirt.c | 4 +- arch/mips/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/jump_label.h | 2 +- arch/s390/include/asm/jump_label.h | 2 +- arch/sparc/include/asm/jump_label.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +- arch/x86/include/asm/paravirt.h | 6 +- arch/x86/kernel/kvm.c | 4 +- arch/x86/kernel/paravirt.c | 4 +- arch/x86/kvm/mmu_audit.c | 8 +- include/linux/jump_label.h | 139 ++++++++++++++++++++++++---------- include/linux/netdevice.h | 4 +- include/linux/netfilter.h | 6 +- include/linux/perf_event.h | 12 +-- include/linux/static_key.h | 1 + include/linux/tracepoint.h | 8 +- include/net/sock.h | 6 +- 
kernel/events/core.c | 16 ++-- kernel/jump_label.c | 128 ++++++++++++++++++------------- kernel/sched/core.c | 18 ++--- kernel/sched/fair.c | 8 +- kernel/sched/sched.h | 14 ++-- kernel/tracepoint.c | 20 ++--- net/core/dev.c | 24 +++--- net/core/net-sysfs.c | 4 +- net/core/sock.c | 4 +- net/core/sysctl_net_core.c | 4 +- net/ipv4/tcp_memcontrol.c | 6 +- net/netfilter/core.c | 6 +- 31 files changed, 298 insertions(+), 205 deletions(-) create mode 100644 include/linux/static_key.h (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..5b448a74d0f7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -47,18 +47,29 @@ config KPROBES If in doubt, say "N". config JUMP_LABEL - bool "Optimize trace point call sites" + bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL help + This option enables a transparent branch optimization that + makes certain almost-always-true or almost-always-false branch + conditions even cheaper to execute within the kernel. + + Certain performance-sensitive kernel code, such as trace points, + scheduler functionality, networking code and KVM have such + branches and include support for this optimization technique. + If it is detected that the compiler has support for "asm goto", - the kernel will compile trace point locations with just a - nop instruction. When trace points are enabled, the nop will - be converted to a jump to the trace function. This technique - lowers overhead and stress on the branch prediction of the - processor. - - On i386, options added to the compiler flags may increase - the size of the kernel slightly. + the kernel will compile such branches with just a nop + instruction. When the condition flag is toggled to true, the + nop will be converted to a jump instruction to execute the + conditional block of instructions. + + This technique lowers overhead and stress on the branch prediction + of the processor and generally makes the kernel faster. 
The update + of the condition is slower, but those are always very rare. + + ( On 32-bit x86, the necessary options added to the compiler + flags may increase the size of the kernel slightly. ) config OPTPROBES def_bool y diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 32551d304cd7..b149b88ea795 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -281,9 +281,9 @@ paravirt_init_missing_ticks_accounting(int cpu) pv_time_ops.init_missing_ticks_accounting(cpu); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline int paravirt_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 100868216c55..1b22f6de2932 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -634,8 +634,8 @@ struct pv_irq_ops pv_irq_ops = { * pv_time_ops * time operations */ -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static int ia64_native_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 1881b316ca45..4d6d77ed9b9d 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,7 +20,7 @@ #define WORD_INSN ".word" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\tnop\n\t" "nop\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 938986e412f1..ae098c438f00 100644 --- 
a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ #define JUMP_ENTRY_TYPE stringify_in_c(FTR_ENTRY_LONG) #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 95a6cf2b5b67..6c32190dc73e 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -13,7 +13,7 @@ #define ASM_ALIGN ".balign 4" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("0: brcl 0,0\n" ".pushsection __jump_table, \"aw\"\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index fc73a82366f8..5080d16a832f 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,7 +7,7 @@ #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a32b18ce6ead..3a16c1483b45 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -9,12 +9,12 @@ #define JUMP_LABEL_NOP_SIZE 5 -#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:" - JUMP_LABEL_INITIAL_NOP + STATIC_KEY_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git 
a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74fb..c0180fd372d2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,9 +230,9 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline u64 paravirt_steal_clock(int cpu) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176b..694d801bf606 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -438,9 +438,9 @@ void __init kvm_guest_init(void) static __init int activate_jump_labels(void) { if (has_steal_clock) { - jump_label_inc(&paravirt_steal_enabled); + static_key_slow_inc(&paravirt_steal_enabled); if (steal_acc) - jump_label_inc(&paravirt_steal_rq_enabled); + static_key_slow_inc(&paravirt_steal_rq_enabled); } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..ada2f99388dd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6b..ea7b4fd34676 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct jump_label_key mmu_audit_key; +static struct static_key mmu_audit_key; static void
__kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_branch((&mmu_audit_key))) + if (static_key_false((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - jump_label_inc(&mmu_audit_key); + static_key_slow_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - jump_label_dec(&mmu_audit_key); + static_key_slow_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f7c69580fea7..2172da2d9bb4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -9,15 +9,15 @@ * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_branch(&key))" statement is a unconditional branch (which + * of a "if (static_key_false(&key))" statement is a unconditional branch (which * defaults to false - and the true block is placed out of line). * - * However at runtime we can change the 'static' branch target using - * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * However at runtime we can change the branch target using + * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key * object and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the jump_label_{inc,dec}() functions + * Since this relies on modifying code the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). 
* OTOH, since the affected branches are unconditional their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total @@ -26,12 +26,26 @@ * * When the control is directly exposed to userspace it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) - * cause significant performance degradation. Struct jump_label_key_deferred and - * jump_label_dec_deferred() provide for this. + * cause significant performance degradation. Struct static_key_deferred and + * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, it falls back to a simple * conditional branch. - */ + * + * struct static_key my_key = STATIC_KEY_INIT_TRUE; + * + * if (static_key_true(&my_key)) { + * } + * + * will result in the true case being in-line and starts the key with a single + * reference. Mixing static_key_true() and static_key_false() on the same key is not + * allowed. + * + * Not initializing the key (static data is initialized to 0s anyway) is the + * same as using STATIC_KEY_INIT_FALSE and static_key_false() is + * equivalent with static_branch(). 
+ * +*/ #include #include @@ -39,16 +53,17 @@ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -struct jump_label_key { +struct static_key { atomic_t enabled; +/* Set lsb bit to 1 if branch is default true, 0 ot */ struct jump_entry *entries; #ifdef CONFIG_MODULES - struct jump_label_mod *next; + struct static_key_mod *next; #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; unsigned long timeout; struct delayed_work work; }; @@ -66,13 +81,34 @@ struct module; #ifdef HAVE_JUMP_LABEL -#ifdef CONFIG_MODULES -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL} -#else -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL} -#endif +#define JUMP_LABEL_TRUE_BRANCH 1UL + +static +inline struct jump_entry *jump_label_get_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries + & ~JUMP_LABEL_TRUE_BRANCH); +} + +static inline bool jump_label_get_branch_default(struct static_key *key) +{ + if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH) + return true; + return false; +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + return arch_static_branch(key); +} -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_true(struct static_key *key) +{ + return !static_key_false(key); +} + +/* Deprecated. Please use 'static_key_false() instead. 
*/ +static __always_inline bool static_branch(struct static_key *key) { return arch_static_branch(key); } @@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry, extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); extern int jump_label_text_reserved(void *start, void *end); -extern void jump_label_inc(struct jump_label_key *key); -extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); -extern bool jump_label_enabled(struct jump_label_key *key); +extern void static_key_slow_inc(struct static_key *key); +extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)0 }) #else /* !HAVE_JUMP_LABEL */ #include -#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} - -struct jump_label_key { +struct static_key { atomic_t enabled; }; @@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; }; -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled))) + if 
(likely(atomic_read(&key->enabled)) > 0) return true; return false; } -static inline void jump_label_inc(struct jump_label_key *key) +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); } -static inline void jump_label_dec(struct jump_label_key *key) +static inline void static_key_slow_dec(struct static_key *key) { atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - jump_label_dec(&key->key); + static_key_slow_dec(&key->key); } static inline int jump_label_text_reserved(void *start, void *end) @@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool jump_label_enabled(struct jump_label_key *key) +static inline bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } static inline int jump_label_apply_nops(struct module *mod) @@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, +static inline void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { } + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1) }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0) }) + #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), 
}) +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c95255..7dfaae7846ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -214,8 +214,8 @@ enum { #include #ifdef CONFIG_RPS -#include -extern struct jump_label_key rps_needed; +#include +extern struct static_key rps_needed; #endif struct neighbour; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index b809265607d0..29734be334c1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if defined(CONFIG_JUMP_LABEL) -#include -extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#include +extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) - return static_branch(&nf_hooks_needed[pf][hook]); + return static_key_false(&nf_hooks_needed[pf][hook]); return !list_empty(&nf_hooks[pf][hook]); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..0d21e6f1cf53 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,7 +514,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1038,7 +1038,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); @@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 
addr) { struct pt_regs hot_regs; - if (static_branch(&perf_swevent_enabled[event_id])) { + if (static_key_false(&perf_swevent_enabled[event_id])) { if (!regs) { perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; @@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct static_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/static_key.h b/include/linux/static_key.h new file mode 100644 index 000000000000..27bd3f8a0857 --- /dev/null +++ b/include/linux/static_key.h @@ -0,0 +1 @@ +#include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fc36da97ff7e..bd96ecd0e05c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct module; struct tracepoint; @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct jump_label_key key; + struct static_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ 
TP_ARGS(data_args), \ @@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void) __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ static struct tracepoint * const __tracepoint_ptr_##name __used \ __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf020..dcde2d9268cd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include @@ -924,13 +924,13 @@ inline void sk_refcnt_debug_release(const struct sock *sk) #endif /* SOCK_REFCNT_DEBUG */ #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) -extern struct jump_label_key memcg_socket_limit_enabled; +extern struct static_key memcg_socket_limit_enabled; static inline struct cg_proto *parent_cg_proto(struct proto *proto, struct cg_proto *cg_proto) { return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg)); } -#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled) +#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) #else #define mem_cgroup_sockets_enabled 0 static inline struct cg_proto *parent_cg_proto(struct proto *proto, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7c3b9de55f6b..5e0f8bb89b2b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ 
-2769,7 +2769,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2780,7 +2780,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); } } @@ -4982,7 +4982,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4990,7 +4990,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -5020,7 +5020,7 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5843,7 +5843,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6081,7 +6081,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 543782e7cdd2..bf9dcadbb53a 100644 --- a/kernel/jump_label.c +++ 
b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #ifdef HAVE_JUMP_LABEL @@ -29,10 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) +bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } +EXPORT_SYMBOL_GPL(static_key_enabled); static int jump_label_cmp(const void *a, const void *b) { @@ -58,22 +59,26 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { @@ -85,32 +90,35 @@ static void __jump_label_dec(struct jump_label_key *key, if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } 
-EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + __static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -153,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) { @@ -170,27 +178,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; 
struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -200,8 +221,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -221,9 +242,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -254,11 +275,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 
- JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -267,8 +284,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -277,28 +294,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
+ */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -310,14 +329,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -419,9 +438,10 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES struct module *mod = __module_address((unsigned long)key); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..112c6824476b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true 
STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((&paravirt_steal_rq_enabled))) { + if (static_key_false((&paravirt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -2756,7 +2756,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(&paravirt_steal_enabled)) { + if (static_key_false(&paravirt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..423547ada38a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1399,20 +1399,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void
account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..b4cd6d8ea150 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include +# include # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +630,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. 
*/ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99d..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. 
*/ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..da7ce7f0e566 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,7 +134,7 @@ #include #include #include -#include +#include #include #include "net-sysfs.h" @@ -1441,11 +1441,11 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct jump_label_key netstamp_needed __read_mostly; +static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL -/* We are not allowed to call jump_label_dec() from irq context +/* We are not allowed to call static_key_slow_dec() from irq context * If net_disable_timestamp() is called from irq context, defer the - * jump_label_dec() calls. + * static_key_slow_dec() calls. 
*/ static atomic_t netstamp_needed_deferred; #endif @@ -1457,12 +1457,12 @@ void net_enable_timestamp(void) if (deferred) { while (--deferred) - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); return; } #endif WARN_ON(in_interrupt()); - jump_label_inc(&netstamp_needed); + static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1474,19 +1474,19 @@ void net_disable_timestamp(void) return; } #endif - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp.tv64 = 0; - if (static_branch(&netstamp_needed)) + if (static_key_false(&netstamp_needed)) __net_timestamp(skb); } #define net_timestamp_check(COND, SKB) \ - if (static_branch(&netstamp_needed)) { \ + if (static_key_false(&netstamp_needed)) { \ if ((COND) && !(SKB)->tstamp.tv64) \ __net_timestamp(SKB); \ } \ @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); -struct jump_label_key rps_needed __read_mostly; +struct static_key rps_needed __read_mostly; static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -2945,7 +2945,7 @@ int netif_rx(struct sk_buff *skb) trace_netif_rx(skb); #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3309,7 +3309,7 @@ int netif_receive_skb(struct sk_buff *skb) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a1727cda03d7..495586232aa1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -608,10 +608,10 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, 
spin_unlock(&rps_map_lock); if (map) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (old_map) { kfree_rcu(old_map, rcu); - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); } free_cpumask_var(mask); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..3a4e5817a2a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,7 +111,7 @@ #include #include #include -#include +#include #include #include @@ -184,7 +184,7 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -struct jump_label_key memcg_socket_limit_enabled; +struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d05559d4d9cd..0c2850874254 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -69,9 +69,9 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (orig_sock_table) { - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..602fb305365f 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); if (val != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); @@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) net->ipv4.sysctl_tcp_mem[i]); if (val == 
RESOURCE_MAX && old_lim != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) - jump_label_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_socket_limit_enabled); return 0; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b4e8ff05b301..e1b7e051332e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -56,7 +56,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); #if defined(CONFIG_JUMP_LABEL) -struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -77,7 +77,7 @@ int nf_register_hook(struct nf_hook_ops *reg) list_add_rcu(&reg->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } @@ -89,7 +89,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) list_del_rcu(&reg->list); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); } -- cgit v1.2.3 From 626109130267713cac020515504ec341e47c96f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 14:54:37 +0000 Subject: x86-64: Fix CFI annotations for NMI nesting code The saving and restoring of %rdx wasn't annotated at all, and the jumping over sections where state gets partly restored wasn't handled either.
Further, by folding the pushing of the previous frame in repeat_nmi into that which so far was immediately preceding restart_nmi (after moving the restore of %rdx ahead of that, since it doesn't get used anymore when pushing prior frames), annotations of the replicated frame creations can be made consistent too. v2: Fully fold repeat_nmi into the normal code flow (adding a single redundant instruction to the "normal" code path), thus retaining the special protection of all instructions between repeat_nmi and end_repeat_nmi. Link: http://lkml.kernel.org/r/4F478B630200007800074A31@nat28.tlf.novell.com Signed-off-by: Jan Beulich Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 52 +++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1333d9851778..e0eca007dc0d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1530,6 +1530,7 @@ ENTRY(nmi) /* Use %rdx as out temp variable throughout */ pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1554,6 +1555,7 @@ ENTRY(nmi) */ lea 6*8(%rsp), %rdx test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + CFI_REMEMBER_STATE nested_nmi: /* @@ -1585,10 +1587,12 @@ nested_nmi: nested_nmi_out: popq_cfi %rdx + CFI_RESTORE rdx /* No need to check faults here */ INTERRUPT_RETURN + CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1624,6 +1628,10 @@ first_nmi: * NMI may zero out. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ + /* Do not pop rdx, nested NMIs will corrupt it */ + movq (%rsp), %rdx + CFI_RESTORE rdx + /* Set the NMI executing variable on the stack. 
*/ pushq_cfi $1 @@ -1631,14 +1639,31 @@ first_nmi: .rept 5 pushq_cfi 6*8(%rsp) .endr + CFI_DEF_CFA_OFFSET SS+8-RIP + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. + */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 5*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ .rept 5 pushq_cfi 4*8(%rsp) .endr - - /* Do not pop rdx, nested NMIs will corrupt it */ - movq 11*8(%rsp), %rdx + CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi: /* * Everything below this point can be preempted by a nested @@ -1646,7 +1671,6 @@ first_nmi: * caused by an exception and nested NMI will start here, and * can still be preempted by another NMI. */ -restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1675,26 +1699,6 @@ nmi_restore: CFI_ENDPROC END(nmi) - /* - * If an NMI hit an iret because of an exception or breakpoint, - * it can lose its NMI context, and a nested NMI may come in. - * In that case, the nested NMI will change the preempted NMI's - * stack to jump to here when it does the final iret. 
- */ -repeat_nmi: - INTR_FRAME - /* Update the stack variable to say we are still in NMI */ - movq $1, 5*8(%rsp) - - /* copy the saved stack back to copy stack */ - .rept 5 - pushq_cfi 4*8(%rsp) - .endr - - jmp restart_nmi - CFI_ENDPROC -end_repeat_nmi: - ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax -- cgit v1.2.3 From 69466466ce889cd2cbc8cda9ff1c6083f48cc7f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 11:55:01 +0000 Subject: x86-64: Improve insn scheduling in SAVE_ARGS_IRQ In one case, use an address register that was computed earlier (and with a simpler instruction), thus reducing the risk of a stall. In the second case, eliminate a branch by using a conditional move (as is already done in call_softirq and xen_do_hypervisor_callback). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F4788A50200007800074A26@nat28.tlf.novell.com Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc87..211b2e1683f1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -319,7 +319,7 @@ ENDPROC(native_usergs_sysret64) movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS(%rdi) + testl $3, CS-RBP(%rsi) je 1f SWAPGS /* @@ -329,11 +329,10 @@ ENDPROC(native_usergs_sysret64) * moving irq_enter into assembly, which would be too much work) */ 1: incl PER_CPU_VAR(irq_count) - jne 2f - mov PER_CPU_VAR(irq_stack_ptr),%rsp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi -2: /* Store previous stack value */ + /* Store previous stack value */ pushq %rsi CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 0x77 /* DW_OP_breg7 */, 0, \ -- cgit v1.2.3 From 79fb4ad63e8266ffac1f69bbb45a6f86570493e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Feb 2012 15:55:13 -0500 Subject: x86: Fix the 
NMI nesting comments Some of the comments for the nesting NMI algorithm were stale and had some references to some prototypes that were first tried. I also updated the comments to be a little easier to understand the flow of the code. It definitely needs the documentation. Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e0eca007dc0d..2de3e457bd4b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1624,11 +1624,12 @@ first_nmi: * | pt_regs | * +-------------------------+ * - * The saved RIP is used to fix up the copied RIP that a nested - * NMI may zero out. The original stack frame and the temp storage + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ - /* Do not pop rdx, nested NMIs will corrupt it */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ movq (%rsp), %rdx CFI_RESTORE rdx @@ -1641,6 +1642,8 @@ first_nmi: .endr CFI_DEF_CFA_OFFSET SS+8-RIP + /* Everything up to here is safe from nested NMIs */ + /* * If there was a nested NMI, the first NMI's iret will return * here. But NMIs are still enabled and we can take another @@ -1667,9 +1670,8 @@ end_repeat_nmi: /* * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception. Repeated NMIs - * caused by an exception and nested NMI will start here, and - * can still be preempted by another NMI. + * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. 
*/ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp -- cgit v1.2.3 From c484b2418b0b5bb7b16f01343330650faee60df2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 23 Feb 2012 23:46:50 -0800 Subject: PCI: Use class for quirk for via_no_dac Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769e21ea..28e5e06fcba4 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + if (forbid_dac == 0) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); forbid_dac = 1; } } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); #endif -- cgit v1.2.3 From 4082cf2d7be958bcb5f98ea3b47ef3c9ef8d97e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 23 Feb 2012 23:46:51 -0800 Subject: PCI: Use class quirk for intel fix_transparent_bridge Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/fixup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6dd89555fbfa..24172ffd795b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -164,11 +164,11 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_ */ static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && - (dev->device & 0xff00) == 0x2400) + if ((dev->device & 0xff00) == 0x2400) dev->transparent = 1; } -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, 
pci_fixup_transparent_bridge); +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, pci_fixup_transparent_bridge); /* * Fixup for C1 Halt Disconnect problem on nForce2 systems. -- cgit v1.2.3 From 73e3b590f38fb7c03ee370430348edf1f401204e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 23 Feb 2012 23:46:52 -0800 Subject: PCI: Use class for quirk for pci_fixup_video Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/fixup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 24172ffd795b..d0e6e403b4f6 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -322,9 +322,6 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev) struct pci_bus *bus; u16 config; - if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) - return; - /* Is VGA routed to us? */ bus = pdev->bus; while (bus) { @@ -353,7 +350,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev) dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); } } -DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = { -- cgit v1.2.3 From 35474c3bb712261c285ca20c568e4e508387cad5 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 17 Feb 2012 22:48:37 +0200 Subject: crypto: serpent-sse2 - use crypto_[un]register_algs Combine all crypto_alg to be registered and use new crypto_[un]register_algs functions. Simplifies init/exit code and reduce object size. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 406 +++++++++++++++--------------------- 1 file changed, 163 insertions(+), 243 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index de81cf4e06a1..5520c7522200 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -145,28 +145,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ecb_crypt(desc, &walk, false); } -static struct crypto_alg blk_ecb_alg = { - .cra_name = "__ecb-serpent-sse2", - .cra_driver_name = "__driver-ecb-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}; - static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk) { @@ -295,28 +273,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg blk_cbc_alg = { - .cra_name = "__cbc-serpent-sse2", - .cra_driver_name = "__driver-cbc-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .setkey = serpent_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, 
- }, -}; - static inline void u128_to_be128(be128 *dst, const u128 *src) { dst->a = cpu_to_be64(src->a); @@ -439,29 +395,6 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg blk_ctr_alg = { - .cra_name = "__ctr-serpent-sse2", - .cra_driver_name = "__driver-ctr-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct serpent_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = serpent_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}; - struct crypt_priv { struct serpent_ctx *ctx; bool fpu_enabled; @@ -580,32 +513,6 @@ static void lrw_exit_tfm(struct crypto_tfm *tfm) lrw_free_table(&ctx->lrw_table); } -static struct crypto_alg blk_lrw_alg = { - .cra_name = "__lrw-serpent-sse2", - .cra_driver_name = "__driver-lrw-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list), - .cra_exit = lrw_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .max_keysize = SERPENT_MAX_KEY_SIZE + - SERPENT_BLOCK_SIZE, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = lrw_serpent_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}; - struct serpent_xts_ctx { struct serpent_ctx tweak_ctx; struct serpent_ctx crypt_ctx; @@ -689,29 +596,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } -static struct crypto_alg blk_xts_alg = { - .cra_name 
= "__xts-serpent-sse2", - .cra_driver_name = "__driver-xts-serpent-sse2", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct serpent_xts_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = SERPENT_MIN_KEY_SIZE * 2, - .max_keysize = SERPENT_MAX_KEY_SIZE * 2, - .ivsize = SERPENT_BLOCK_SIZE, - .setkey = xts_serpent_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, - }, - }, -}; - static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { @@ -813,7 +697,157 @@ static int ablk_ecb_init(struct crypto_tfm *tfm) return 0; } -static struct crypto_alg ablk_ecb_alg = { +static int ablk_cbc_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static int ablk_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static int ablk_lrw_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static int ablk_xts_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg serpent_algs[10] = 
{ { + .cra_name = "__ecb-serpent-sse2", + .cra_driver_name = "__driver-ecb-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-serpent-sse2", + .cra_driver_name = "__driver-cbc-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-serpent-sse2", + .cra_driver_name = "__driver-ctr-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-serpent-sse2", + .cra_driver_name = "__driver-lrw-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_lrw_ctx), + 
.cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = lrw_serpent_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-serpent-sse2", + .cra_driver_name = "__driver-xts-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { .cra_name = "ecb(serpent)", .cra_driver_name = "ecb-serpent-sse2", .cra_priority = 400, @@ -823,7 +857,7 @@ static struct crypto_alg ablk_ecb_alg = { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list), + .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list), .cra_init = ablk_ecb_init, .cra_exit = ablk_exit, .cra_u = { @@ -835,20 +869,7 @@ static struct crypto_alg ablk_ecb_alg = { .decrypt = ablk_decrypt, }, }, -}; - -static int ablk_cbc_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static struct crypto_alg ablk_cbc_alg = { +}, { .cra_name = "cbc(serpent)", .cra_driver_name = "cbc-serpent-sse2", 
.cra_priority = 400, @@ -858,7 +879,7 @@ static struct crypto_alg ablk_cbc_alg = { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list), + .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list), .cra_init = ablk_cbc_init, .cra_exit = ablk_exit, .cra_u = { @@ -871,20 +892,7 @@ static struct crypto_alg ablk_cbc_alg = { .decrypt = ablk_decrypt, }, }, -}; - -static int ablk_ctr_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static struct crypto_alg ablk_ctr_alg = { +}, { .cra_name = "ctr(serpent)", .cra_driver_name = "ctr-serpent-sse2", .cra_priority = 400, @@ -894,7 +902,7 @@ static struct crypto_alg ablk_ctr_alg = { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), + .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list), .cra_init = ablk_ctr_init, .cra_exit = ablk_exit, .cra_u = { @@ -908,20 +916,7 @@ static struct crypto_alg ablk_ctr_alg = { .geniv = "chainiv", }, }, -}; - -static int ablk_lrw_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static struct crypto_alg ablk_lrw_alg = { +}, { .cra_name = "lrw(serpent)", .cra_driver_name = "lrw-serpent-sse2", .cra_priority = 400, @@ -931,7 +926,7 @@ static struct crypto_alg ablk_lrw_alg = { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), + .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list), .cra_init = ablk_lrw_init, .cra_exit = ablk_exit, .cra_u = { 
@@ -946,20 +941,7 @@ static struct crypto_alg ablk_lrw_alg = { .decrypt = ablk_decrypt, }, }, -}; - -static int ablk_xts_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static struct crypto_alg ablk_xts_alg = { +}, { .cra_name = "xts(serpent)", .cra_driver_name = "xts-serpent-sse2", .cra_priority = 400, @@ -969,7 +951,7 @@ static struct crypto_alg ablk_xts_alg = { .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), + .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list), .cra_init = ablk_xts_init, .cra_exit = ablk_exit, .cra_u = { @@ -982,83 +964,21 @@ static struct crypto_alg ablk_xts_alg = { .decrypt = ablk_decrypt, }, }, -}; +} }; static int __init serpent_sse2_init(void) { - int err; - if (!cpu_has_xmm2) { printk(KERN_INFO "SSE2 instructions are not detected.\n"); return -ENODEV; } - err = crypto_register_alg(&blk_ecb_alg); - if (err) - goto blk_ecb_err; - err = crypto_register_alg(&blk_cbc_alg); - if (err) - goto blk_cbc_err; - err = crypto_register_alg(&blk_ctr_alg); - if (err) - goto blk_ctr_err; - err = crypto_register_alg(&ablk_ecb_alg); - if (err) - goto ablk_ecb_err; - err = crypto_register_alg(&ablk_cbc_alg); - if (err) - goto ablk_cbc_err; - err = crypto_register_alg(&ablk_ctr_alg); - if (err) - goto ablk_ctr_err; - err = crypto_register_alg(&blk_lrw_alg); - if (err) - goto blk_lrw_err; - err = crypto_register_alg(&ablk_lrw_alg); - if (err) - goto ablk_lrw_err; - err = crypto_register_alg(&blk_xts_alg); - if (err) - goto blk_xts_err; - err = crypto_register_alg(&ablk_xts_alg); - if (err) - goto ablk_xts_err; - return err; - -ablk_xts_err: - crypto_unregister_alg(&blk_xts_alg); -blk_xts_err: - crypto_unregister_alg(&ablk_lrw_alg); -ablk_lrw_err: - 
crypto_unregister_alg(&blk_lrw_alg); -blk_lrw_err: - crypto_unregister_alg(&ablk_ctr_alg); -ablk_ctr_err: - crypto_unregister_alg(&ablk_cbc_alg); -ablk_cbc_err: - crypto_unregister_alg(&ablk_ecb_alg); -ablk_ecb_err: - crypto_unregister_alg(&blk_ctr_alg); -blk_ctr_err: - crypto_unregister_alg(&blk_cbc_alg); -blk_cbc_err: - crypto_unregister_alg(&blk_ecb_alg); -blk_ecb_err: - return err; + return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); } static void __exit serpent_sse2_exit(void) { - crypto_unregister_alg(&ablk_xts_alg); - crypto_unregister_alg(&blk_xts_alg); - crypto_unregister_alg(&ablk_lrw_alg); - crypto_unregister_alg(&blk_lrw_alg); - crypto_unregister_alg(&ablk_ctr_alg); - crypto_unregister_alg(&ablk_cbc_alg); - crypto_unregister_alg(&ablk_ecb_alg); - crypto_unregister_alg(&blk_ctr_alg); - crypto_unregister_alg(&blk_cbc_alg); - crypto_unregister_alg(&blk_ecb_alg); + crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_sse2_init); -- cgit v1.2.3 From 53709ddee36cbd19434aa0f0ac8c1e27b92aca33 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 17 Feb 2012 22:48:43 +0200 Subject: crypto: twofish-x86_64-3way - use crypto_[un]register_algs Combine all crypto_alg to be registered and use new crypto_[un]register_algs functions. Simplifies init/exit code and reduce object size. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 219 +++++++++++++++--------------------- 1 file changed, 89 insertions(+), 130 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 2c7f14ec7082..408fc0c5814e 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -123,28 +123,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); } -static struct crypto_alg blk_ecb_alg = { - .cra_name = "ecb(twofish)", - .cra_driver_name = "ecb-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .setkey = twofish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}; - static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk) { @@ -268,29 +246,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg blk_cbc_alg = { - .cra_name = "cbc(twofish)", - .cra_driver_name = "cbc-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - 
}, - }, -}; - static inline void u128_to_be128(be128 *dst, const u128 *src) { dst->a = cpu_to_be64(src->a); @@ -412,29 +367,6 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg blk_ctr_alg = { - .cra_name = "ctr(twofish)", - .cra_driver_name = "ctr-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE, - .max_keysize = TF_MAX_KEY_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = twofish_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}; - static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) { const unsigned int bsize = TF_BLOCK_SIZE; @@ -525,30 +457,6 @@ static void lrw_exit_tfm(struct crypto_tfm *tfm) lrw_free_table(&ctx->lrw_table); } -static struct crypto_alg blk_lrw_alg = { - .cra_name = "lrw(twofish)", - .cra_driver_name = "lrw-twofish-3way", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_lrw_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list), - .cra_exit = lrw_exit_tfm, - .cra_u = { - .blkcipher = { - .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, - .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE, - .ivsize = TF_BLOCK_SIZE, - .setkey = lrw_twofish_setkey, - .encrypt = lrw_encrypt, - .decrypt = lrw_decrypt, - }, - }, -}; - struct twofish_xts_ctx { struct twofish_ctx tweak_ctx; struct twofish_ctx crypt_ctx; @@ -615,7 +523,91 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return xts_crypt(desc, dst, src, nbytes, &req); } -static struct crypto_alg 
blk_xts_alg = { +static struct crypto_alg tf_algs[5] = { { + .cra_name = "ecb(twofish)", + .cra_driver_name = "ecb-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(tf_algs[0].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(twofish)", + .cra_driver_name = "cbc-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(tf_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "ctr(twofish)", + .cra_driver_name = "ctr-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(tf_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "lrw(twofish)", + .cra_driver_name = "lrw-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_lrw_ctx), + .cra_alignmask = 0, + .cra_type = 
&crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, + .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = lrw_twofish_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { .cra_name = "xts(twofish)", .cra_driver_name = "xts-twofish-3way", .cra_priority = 300, @@ -625,7 +617,7 @@ static struct crypto_alg blk_xts_alg = { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list), + .cra_list = LIST_HEAD_INIT(tf_algs[4].cra_list), .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE * 2, @@ -636,7 +628,7 @@ static struct crypto_alg blk_xts_alg = { .decrypt = xts_decrypt, }, }, -}; +} }; static bool is_blacklisted_cpu(void) { @@ -678,8 +670,6 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); int __init init(void) { - int err; - if (!force && is_blacklisted_cpu()) { printk(KERN_INFO "twofish-x86_64-3way: performance on this CPU " @@ -688,43 +678,12 @@ int __init init(void) return -ENODEV; } - err = crypto_register_alg(&blk_ecb_alg); - if (err) - goto ecb_err; - err = crypto_register_alg(&blk_cbc_alg); - if (err) - goto cbc_err; - err = crypto_register_alg(&blk_ctr_alg); - if (err) - goto ctr_err; - err = crypto_register_alg(&blk_lrw_alg); - if (err) - goto blk_lrw_err; - err = crypto_register_alg(&blk_xts_alg); - if (err) - goto blk_xts_err; - - return 0; - -blk_xts_err: - crypto_unregister_alg(&blk_lrw_alg); -blk_lrw_err: - crypto_unregister_alg(&blk_ctr_alg); -ctr_err: - crypto_unregister_alg(&blk_cbc_alg); -cbc_err: - crypto_unregister_alg(&blk_ecb_alg); -ecb_err: - return err; + return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); } void __exit fini(void) { - crypto_unregister_alg(&blk_xts_alg); - crypto_unregister_alg(&blk_lrw_alg); - 
crypto_unregister_alg(&blk_ctr_alg); - crypto_unregister_alg(&blk_cbc_alg); - crypto_unregister_alg(&blk_ecb_alg); + crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); } module_init(init); -- cgit v1.2.3 From d433208cfc3db3ae0520da92a15ac1f82d8b61ed Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 17 Feb 2012 22:48:48 +0200 Subject: crypto: blowfish-x86_64 - use crypto_[un]register_algs Combine all crypto_alg to be registered and use new crypto_[un]register_algs functions. Simplifies init/exit code and reduce object size. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish_glue.c | 163 ++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 98 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 2970110d2cea..73bc8a93f0ce 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -77,27 +77,6 @@ static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src); } -static struct crypto_alg bf_alg = { - .cra_name = "blowfish", - .cra_driver_name = "blowfish-asm", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 3, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(bf_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = BF_MIN_KEY_SIZE, - .cia_max_keysize = BF_MAX_KEY_SIZE, - .cia_setkey = blowfish_setkey, - .cia_encrypt = blowfish_encrypt, - .cia_decrypt = blowfish_decrypt, - } - } -}; - static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, void (*fn)(struct bf_ctx *, u8 *, const u8 *), void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *)) @@ -161,28 +140,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way); } 
-static struct crypto_alg blk_ecb_alg = { - .cra_name = "ecb(blowfish)", - .cra_driver_name = "ecb-blowfish-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .setkey = blowfish_setkey, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}; - static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk) { @@ -308,29 +265,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg blk_cbc_alg = { - .cra_name = "cbc(blowfish)", - .cra_driver_name = "cbc-blowfish-asm", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = BF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = BF_MIN_KEY_SIZE, - .max_keysize = BF_MAX_KEY_SIZE, - .ivsize = BF_BLOCK_SIZE, - .setkey = blowfish_setkey, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}; - static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) { u8 *ctrblk = walk->iv; @@ -424,7 +358,67 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, return err; } -static struct crypto_alg blk_ctr_alg = { +static struct crypto_alg bf_algs[4] = { { + .cra_name = "blowfish", + .cra_driver_name = "blowfish-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_list = 
LIST_HEAD_INIT(bf_algs[0].cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = BF_MIN_KEY_SIZE, + .cia_max_keysize = BF_MAX_KEY_SIZE, + .cia_setkey = blowfish_setkey, + .cia_encrypt = blowfish_encrypt, + .cia_decrypt = blowfish_decrypt, + } + } +}, { + .cra_name = "ecb(blowfish)", + .cra_driver_name = "ecb-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(bf_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .setkey = blowfish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(blowfish)", + .cra_driver_name = "cbc-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(bf_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .ivsize = BF_BLOCK_SIZE, + .setkey = blowfish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { .cra_name = "ctr(blowfish)", .cra_driver_name = "ctr-blowfish-asm", .cra_priority = 300, @@ -434,7 +428,7 @@ static struct crypto_alg blk_ctr_alg = { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_list = LIST_HEAD_INIT(bf_algs[3].cra_list), .cra_u = { .blkcipher = { .min_keysize = BF_MIN_KEY_SIZE, @@ -445,7 +439,7 @@ static struct crypto_alg blk_ctr_alg = { .decrypt = ctr_crypt, }, }, -}; +} }; static bool is_blacklisted_cpu(void) { @@ -470,8 +464,6 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); 
static int __init init(void) { - int err; - if (!force && is_blacklisted_cpu()) { printk(KERN_INFO "blowfish-x86_64: performance on this CPU " @@ -480,37 +472,12 @@ static int __init init(void) return -ENODEV; } - err = crypto_register_alg(&bf_alg); - if (err) - goto bf_err; - err = crypto_register_alg(&blk_ecb_alg); - if (err) - goto ecb_err; - err = crypto_register_alg(&blk_cbc_alg); - if (err) - goto cbc_err; - err = crypto_register_alg(&blk_ctr_alg); - if (err) - goto ctr_err; - - return 0; - -ctr_err: - crypto_unregister_alg(&blk_cbc_alg); -cbc_err: - crypto_unregister_alg(&blk_ecb_alg); -ecb_err: - crypto_unregister_alg(&bf_alg); -bf_err: - return err; + return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs)); } static void __exit fini(void) { - crypto_unregister_alg(&blk_ctr_alg); - crypto_unregister_alg(&blk_cbc_alg); - crypto_unregister_alg(&blk_ecb_alg); - crypto_unregister_alg(&bf_alg); + crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs)); } module_init(init); -- cgit v1.2.3 From 435d3e51af3de0c1fe9f6ca1a18df3cd4d6b8c17 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 17 Feb 2012 22:48:53 +0200 Subject: crypto: serpent-sse2 - combine ablk_*_init functions Driver name in ablk_*_init functions can be constructed runtime. Therefore use single function ablk_init to reduce object size. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 71 +++++++------------------------------ 1 file changed, 13 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 5520c7522200..4b21be85e0a1 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -676,68 +676,23 @@ static void ablk_exit(struct crypto_tfm *tfm) cryptd_free_ablkcipher(ctx->cryptd_tfm); } -static void ablk_init_common(struct crypto_tfm *tfm, - struct cryptd_ablkcipher *cryptd_tfm) +static int ablk_init(struct crypto_tfm *tfm) { struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); -} - -static int ablk_ecb_init(struct crypto_tfm *tfm) -{ struct cryptd_ablkcipher *cryptd_tfm; + char drv_name[CRYPTO_MAX_ALG_NAME]; - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static int ablk_cbc_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; + snprintf(drv_name, sizeof(drv_name), "__driver-%s", + crypto_tfm_alg_driver_name(tfm)); - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0); + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} -static int ablk_ctr_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static int ablk_lrw_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - 
cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static int ablk_xts_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); return 0; } @@ -858,7 +813,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list), - .cra_init = ablk_ecb_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { @@ -880,7 +835,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list), - .cra_init = ablk_cbc_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { @@ -903,7 +858,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list), - .cra_init = ablk_ctr_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { @@ -927,7 +882,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list), - .cra_init = ablk_lrw_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { @@ -952,7 +907,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list), - .cra_init = ablk_xts_init, + .cra_init = 
ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { -- cgit v1.2.3 From 919e2c32496aec4170bd67e64efd526dd0a9bbdc Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 17 Feb 2012 22:48:58 +0200 Subject: crypto: blowfish-x86_64 - set alignmask to zero x86 has fast unaligned accesses, so blowfish-x86_64 does not need to enforce alignment. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 73bc8a93f0ce..7967474de8f7 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -365,7 +365,7 @@ static struct crypto_alg bf_algs[4] = { { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = BF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 3, + .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(bf_algs[0].cra_list), .cra_u = { -- cgit v1.2.3 From 894042648902d11d579af2a936a5a9a43cd5f1e4 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 17 Feb 2012 22:49:03 +0200 Subject: crypto: twofish-x86_64/i586 - set alignmask to zero x86 has fast unaligned accesses, so twofish-x86_64/i586 does not need to enforce alignment. 
Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index dc6b3fb817fc..359ae084275c 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -68,7 +68,7 @@ static struct crypto_alg alg = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 3, + .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(alg.cra_list), .cra_u = { -- cgit v1.2.3 From ce5f7a99df87918b5be4618a9386213a8e9a7146 Mon Sep 17 00:00:00 2001 From: Bobby Powers Date: Sat, 25 Feb 2012 23:25:38 -0500 Subject: x32: Make sure TS_COMPAT is cleared for x32 tasks If a process has a non-x32 ia32 personality and changes to x32, the process would keep its TS_COMPAT flag. x32 uses the presence of the x32 flag on a syscall to determine compat status, so make sure TS_COMPAT is cleared. Signed-off-by: Bobby Powers Link: http://lkml.kernel.org/r/1330230338-25077-1-git-send-email-bobbypowers@gmail.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/process_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a0701da2bd18..32e04120b2cd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -540,6 +540,9 @@ void set_personality_ia32(bool x32) clear_thread_flag(TIF_IA32); set_thread_flag(TIF_X32); current->personality &= ~READ_IMPLIES_EXEC; + /* is_compat_task() uses the presence of the x32 + syscall bit flag to determine compat status */ + current_thread_info()->status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); -- cgit v1.2.3 From 00194b2e845da29395ad00c13a884d9acb9306b5 Mon Sep 17 00:00:00 2001 From: Bobby Powers Date: Sat, 25 Feb 2012 22:59:34 -0500 Subject: x32: Only clear TIF_X32 flag once Commits bb212724 and d1a797f3 both added a call to clear_thread_flag(TIF_X32) under set_personality_64bit() - only one is needed. Signed-off-by: Bobby Powers Link: http://lkml.kernel.org/r/1330228774-24223-1-git-send-email-bobbypowers@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 32e04120b2cd..a4659739e202 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -510,7 +510,6 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); -- cgit v1.2.3 From 42dfc43ee5999ac64284476ea0ac6c937587cf2b Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Sun, 26 Feb 2012 21:47:55 +0530 Subject: x86_64: Record stack pointer before task execution begins task->thread.usersp is unusable immediately after a binary is exec()'d until it undergoes a context switch cycle. 
The start_thread() function called during execve() saves the stack pointer into pt_regs and into old_rsp, but fails to record it into task->thread.usersp. Because of this, KSTK_ESP(task) returns an incorrect value for a 64-bit program until the task is switched out and back in since switch_to swaps %rsp values in and out into task->thread.usersp. Signed-off-by: Siddhesh Poyarekar Link: http://lkml.kernel.org/r/1330273075-2949-1-git-send-email-siddhesh.poyarekar@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1fd94bc4279d..eb54dd0fbed6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -341,6 +341,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); -- cgit v1.2.3 From f0ba662a6e06f2fb58201800eff33dcad9246f97 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 12:09:21 +0000 Subject: x86: Properly _init-annotate NMI selftest code After all, this code is being run once at boot only (if configured in at all). 
Signed-off-by: Jan Beulich Acked-by: Don Zickus Link: http://lkml.kernel.org/r/4F478C010200007800074A3D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 0d01a8ea4e11..2c39dcd510fa 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -20,35 +21,35 @@ #define FAILURE 1 #define TIMEOUT 2 -static int nmi_fail; +static int __initdata nmi_fail; /* check to see if NMI IPIs work on this machine */ -static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; -static int testcase_total; -static int testcase_successes; -static int expected_testcase_failures; -static int unexpected_testcase_failures; -static int unexpected_testcase_unknowns; +static int __initdata testcase_total; +static int __initdata testcase_successes; +static int __initdata expected_testcase_failures; +static int __initdata unexpected_testcase_failures; +static int __initdata unexpected_testcase_unknowns; -static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) { unexpected_testcase_unknowns++; return NMI_HANDLED; } -static void init_nmi_testsuite(void) +static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); } -static void cleanup_nmi_testsuite(void) +static void __init cleanup_nmi_testsuite(void) { unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); } -static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) { int cpu = 
raw_smp_processor_id(); @@ -58,7 +59,7 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) return NMI_DONE; } -static void test_nmi_ipi(struct cpumask *mask) +static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; @@ -86,7 +87,7 @@ static void test_nmi_ipi(struct cpumask *mask) return; } -static void remote_ipi(void) +static void __init remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); @@ -94,19 +95,19 @@ static void remote_ipi(void) test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -static void local_ipi(void) +static void __init local_ipi(void) { cpumask_clear(to_cpumask(nmi_ipi_mask)); cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -static void reset_nmi(void) +static void __init reset_nmi(void) { nmi_fail = 0; } -static void dotest(void (*testcase_fn)(void), int expected) +static void __init dotest(void (*testcase_fn)(void), int expected) { testcase_fn(); /* @@ -131,12 +132,12 @@ static void dotest(void (*testcase_fn)(void), int expected) reset_nmi(); } -static inline void print_testname(const char *testname) +static inline void __init print_testname(const char *testname) { printk("%12s:", testname); } -void nmi_selftest(void) +void __init nmi_selftest(void) { init_nmi_testsuite(); -- cgit v1.2.3 From d93c4071b78f4676ef70ec8f2d4bae59b6cc5523 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 11:50:27 +0000 Subject: x86/time: Eliminate unused irq0_irqs counter As of v2.6.38 this counter is being maintained without ever being read. 
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F4787930200007800074A10@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq.h | 1 - arch/x86/kernel/time.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index da0b3ca815b7..382f75d735f3 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -7,7 +7,6 @@ typedef struct { unsigned int __softirq_pending; unsigned int __nmi_count; /* arch dependent */ - unsigned int irq0_irqs; #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index dd5fbf4101fc..c6eba2b42673 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc); */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ -- cgit v1.2.3 From 928282e432ee584129a39da831ffa72c38e189b7 Mon Sep 17 00:00:00 2001 From: Mark Wielaard Date: Fri, 24 Feb 2012 11:32:05 +0100 Subject: x86-64: Fix CFI data for common_interrupt() Commit eab9e6137f23 ("x86-64: Fix CFI data for interrupt frames") introduced a DW_CFA_def_cfa_expression in the SAVE_ARGS_IRQ macro. To later define the CFA using a simple register+offset rule both register and offset need to be supplied. Just using CFI_DEF_CFA_REGISTER leaves the offset undefined. So use CFI_DEF_CFA with reg+off explicitly at the end of common_interrupt. 
Signed-off-by: Mark Wielaard Acked-by: Jan Beulich Link: http://lkml.kernel.org/r/1330079527-30711-1-git-send-email-mjw@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3fe8239fd8fb..54be36bf2620 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -813,7 +813,7 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA_REGISTER rsi + CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ leaq ARGOFFSET-RBP(%rsi), %rsp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET -- cgit v1.2.3 From 0bf6276392e990dd0da0ccd8e10f42597d503f29 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 27 Feb 2012 14:09:10 -0800 Subject: x32: Warn and disable rather than error if binutils too old If X32 is enabled in .config, but the binutils can't build it, issue a warning and disable the feature rather than erroring out. In order to support this, have CONFIG_X86_X32 be the option set in Kconfig, and CONFIG_X86_X32_ABI be the option set by the Makefile when it is enabled and binutils has been found to be functional. Requested-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: H. J. Lu Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/Kconfig | 4 ++-- arch/x86/Makefile | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9d6c9ed27e5..e2b38b4bffdc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2175,7 +2175,7 @@ config IA32_AOUT ---help--- Support old a.out binaries in the 32bit emulation. 
-config X86_X32_ABI +config X86_X32 bool "x32 ABI for 64-bit mode (EXPERIMENTAL)" depends on X86_64 && IA32_EMULATION && EXPERIMENTAL ---help--- @@ -2190,7 +2190,7 @@ config X86_X32_ABI config COMPAT def_bool y - depends on IA32_EMULATION || X86_X32_ABI + depends on IA32_EMULATION || X86_X32 config COMPAT_FOR_U64_ALIGNMENT def_bool COMPAT diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 209ba1294592..31bb1eb1216a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -82,6 +82,22 @@ ifdef CONFIG_CC_STACKPROTECTOR endif endif +ifdef CONFIG_X86_X32 + x32_ld_ok := $(call try-run,\ + /bin/echo -e '1: .quad 1b' | \ + $(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" - && \ + $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \ + $(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n) + ifeq ($(x32_ld_ok),y) + CONFIG_X86_X32_ABI := y + KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI + KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI + else + $(warning CONFIG_X86_X32 enabled but no binutils support) + endif +endif +export CONFIG_X86_X32_ABI + # Don't unroll struct assignments with kmemcheck enabled ifeq ($(CONFIG_KMEMCHECK),y) KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) -- cgit v1.2.3 From 8bd69c2d5f9c0b5237c632d1b21dbfe4fd16ba6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Feb 2012 10:35:06 +0100 Subject: x86/x32: Fix the binutils auto-detect Fix: arch/x86/Makefile:96: *** recipe commences before first target. Stop. Cc: H. Peter Anvin Cc: H. J. 
Lu Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 31bb1eb1216a..968dbe24a255 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -88,13 +88,13 @@ ifdef CONFIG_X86_X32 $(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" - && \ $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \ $(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n) - ifeq ($(x32_ld_ok),y) - CONFIG_X86_X32_ABI := y - KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI - KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI - else - $(warning CONFIG_X86_X32 enabled but no binutils support) - endif + ifeq ($(x32_ld_ok),y) + CONFIG_X86_X32_ABI := y + KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI + KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI + else + $(warning CONFIG_X86_X32 enabled but no binutils support) + endif endif export CONFIG_X86_X32_ABI -- cgit v1.2.3 From 55f9709cd07c9d33e30b575ee1b3bfd0aeaa3760 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 28 Feb 2012 13:37:21 +0000 Subject: x86, relocs: Don't open code put_unaligned_le32() Use the new headers in tools/include instead of rolling our own put_unaligned_le32() implementation. Cc: H. Peter Anvin Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1330436245-24875-3-git-send-email-matt@console-pimps.org Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/relocs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index 89bbf4e4d05d..d3c0b0277666 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -10,6 +10,7 @@ #define USE_BSD #include #include +#include static void die(char *fmt, ...); @@ -605,10 +606,7 @@ static void emit_relocs(int as_text) fwrite("\0\0\0\0", 4, 1, stdout); /* Now print each relocation */ for (i = 0; i < reloc_count; i++) { - buf[0] = (relocs[i] >> 0) & 0xff; - buf[1] = (relocs[i] >> 8) & 0xff; - buf[2] = (relocs[i] >> 16) & 0xff; - buf[3] = (relocs[i] >> 24) & 0xff; + put_unaligned_le32(relocs[i], buf); fwrite(buf, 4, 1, stdout); } } -- cgit v1.2.3 From 12871c568305a0b20f116315479a18cd46882e9b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 28 Feb 2012 13:37:22 +0000 Subject: x86, mkpiggy: Don't open code put_unaligned_le32() Use the new headers in tools/include instead of rolling our own put_unaligned_le32() implementation. Cc: H. Peter Anvin Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1330436245-24875-4-git-send-email-matt@console-pimps.org Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/mkpiggy.c | 11 ++--------- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index b123b9a8f5b3..fd55a2ff3ad8 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -22,6 +22,7 @@ LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy +HOST_EXTRACFLAGS += -I$(srctree)/tools/include VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 46a823882437..958a641483dd 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -29,14 +29,7 @@ #include #include #include - -static uint32_t getle32(const void *p) -{ - const uint8_t *cp = p; - - return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) + - ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24); -} +#include int main(int argc, char *argv[]) { @@ -69,7 +62,7 @@ int main(int argc, char *argv[]) } ilen = ftell(f); - olen = getle32(&olen); + olen = get_unaligned_le32(&olen); fclose(f); /* -- cgit v1.2.3 From d40f833630a1299fd377408dc8d8fac370d621b0 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 28 Feb 2012 13:37:23 +0000 Subject: x86, boot: Restrict CFLAGS for hostprogs Currently tools/build has access to all the kernel headers in $(srctree). This is unnecessary and could potentially allow tools/build to erroneously include kernel headers when it should only be including userspace-exported headers. Unfortunately, mkcpustr still needs access to some of the asm kernel headers, so explicitly special case that hostprog. Cc: H. Peter Anvin Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1330436245-24875-5-git-send-email-matt@console-pimps.org Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 95365a82b6a0..3e02148bb774 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -37,8 +37,9 @@ setup-y += video-bios.o targets += $(setup-y) hostprogs-y := mkcpustr tools/build -HOST_EXTRACFLAGS += $(LINUXINCLUDE) - +HOSTCFLAGS_mkcpustr.o := -I$(srctree)/arch/$(SRCARCH)/include +HOST_EXTRACFLAGS += -I$(objtree)/include -I$(srctree)/tools/include \ + -include $(srctree)/include/linux/kconfig.h $(obj)/cpu.o: $(obj)/cpustr.h quiet_cmd_cpustr = CPUSTR $@ -- cgit v1.2.3 From 92f42c50f227ad228f815a8f4eec872524dae3a5 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 28 Feb 2012 13:37:24 +0000 Subject: x86, efi: Fix endian issues and unaligned accesses We may need to convert the endianness of the data we read from/write to 'buf', so let's use {get,put}_unaligned_le32() to do that. Failure to do so can result in accessing invalid memory, leading to a segfault. Stephen Rothwell noticed this bug while cross-building an x86_64 allmodconfig kernel on PowerPC. We need to read from and write to 'buf' a byte at a time otherwise it's possible we'll perform an unaligned access, which can lead to bus errors when cross-building an x86 kernel on risc architectures. Cc: H. Peter Anvin Cc: Nick Bowler Tested-by: Stephen Rothwell Reported-by: Stephen Rothwell Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1330436245-24875-6-git-send-email-matt@console-pimps.org Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/tools/build.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 4e9bd6bcafa6..f2ac95ece0cc 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -34,6 +34,7 @@ #include #include #include +#include typedef unsigned char u8; typedef unsigned short u16; @@ -41,6 +42,7 @@ typedef unsigned long u32; #define DEFAULT_MAJOR_ROOT 0 #define DEFAULT_MINOR_ROOT 0 +#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT) /* Minimal number of setup sectors */ #define SETUP_SECT_MIN 5 @@ -159,7 +161,7 @@ int main(int argc, char ** argv) die("read-error on `setup'"); if (c < 1024) die("The setup must be at least 1024 bytes"); - if (buf[510] != 0x55 || buf[511] != 0xaa) + if (get_unaligned_le16(&buf[510]) != 0xAA55) die("Boot block hasn't got boot flag (0xAA55)"); fclose(file); @@ -171,8 +173,7 @@ int main(int argc, char ** argv) memset(buf+c, 0, i-c); /* Set the default root device */ - buf[508] = DEFAULT_MINOR_ROOT; - buf[509] = DEFAULT_MAJOR_ROOT; + put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i); @@ -192,44 +193,42 @@ int main(int argc, char ** argv) /* Patch the setup code with the appropriate size parameters */ buf[0x1f1] = setup_sectors-1; - buf[0x1f4] = sys_size; - buf[0x1f5] = sys_size >> 8; - buf[0x1f6] = sys_size >> 16; - buf[0x1f7] = sys_size >> 24; + put_unaligned_le32(sys_size, &buf[0x1f4]); #ifdef CONFIG_EFI_STUB file_sz = sz + i + ((sys_size * 16) - sz); - pe_header = *(unsigned int *)&buf[0x3c]; + pe_header = get_unaligned_le32(&buf[0x3c]); /* Size of code */ - *(unsigned int *)&buf[pe_header + 0x1c] = file_sz; + put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]); /* Size of image */ - *(unsigned int *)&buf[pe_header + 0x50] = file_sz; + put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); 
#ifdef CONFIG_X86_32 /* Address of entry point */ - *(unsigned int *)&buf[pe_header + 0x28] = i; + put_unaligned_le32(i, &buf[pe_header + 0x28]); /* .text size */ - *(unsigned int *)&buf[pe_header + 0xb0] = file_sz; + put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]); /* .text size of initialised data */ - *(unsigned int *)&buf[pe_header + 0xb8] = file_sz; + put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]); #else /* * Address of entry point. startup_32 is at the beginning and * the 64-bit entry point (startup_64) is always 512 bytes * after. */ - *(unsigned int *)&buf[pe_header + 0x28] = i + 512; + put_unaligned_le32(i + 512, &buf[pe_header + 0x28]); /* .text size */ - *(unsigned int *)&buf[pe_header + 0xc0] = file_sz; + put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]); /* .text size of initialised data */ - *(unsigned int *)&buf[pe_header + 0xc8] = file_sz; + put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]); + #endif /* CONFIG_X86_32 */ #endif /* CONFIG_EFI_STUB */ -- cgit v1.2.3 From 8411371709610c826bf65684f886bfdfb5780ca1 Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Tue, 28 Feb 2012 11:51:10 -0700 Subject: x86/PCI: use host bridge _CRS info on MSI MS-7253 In the spirit of commit 29cf7a30f8a0 ("x86/PCI: use host bridge _CRS info on ASUS M2V-MX SE"), this DMI quirk turns on "pci_use_crs" by default on a board that needs it. This fixes boot failures and oopses introduced in 3e3da00c01d0 ("x86/pci: AMD one chain system to use pci read out res"). The quirk is quite targeted (to a specific board and BIOS version) for two reasons: (1) to emphasize that this method of tackling the problem one quirk at a time is a little insane (2) to give BIOS vendors an opportunity to use simpler tables and allow us to return to generic behavior (whatever that happens to be) with a later BIOS update In other words, I am not at all happy with having quirks like this. But it is even worse for the kernel not to work out of the box on these machines, so...
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=42619 Reported-by: Svante Signell Signed-off-by: Jonathan Nieder Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index c33e0970ee9f..7034c081b226 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -60,6 +60,17 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=42619 */ + { + .callback = set_use_crs, + .ident = "MSI MS-7253", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"), + DMI_MATCH(DMI_BOARD_NAME, "MS-7253"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "V1.6"), + }, + }, /* Now for the blacklist.. */ -- cgit v1.2.3 From f649e9388cd46ad1634164e56f96ae092ca59e4a Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 20 Jan 2012 16:24:09 -0500 Subject: x86: relocate get/set debugreg fcns to include/asm/debugreg. Since we already have a debugreg.h header file, move the assoc. get/set functions to it. In addition to it being the logical home for them, it has a secondary advantage. The functions that are moved use BUG(). So we really need to have linux/bug.h in scope. But asm/processor.h is used about 600 times, vs. only about 15 for debugreg.h -- so adding bug.h to the latter reduces the amount of time we'll be processing it during a compile. Signed-off-by: Paul Gortmaker Acked-by: Ingo Molnar CC: Thomas Gleixner CC: "H. 
Peter Anvin" --- arch/x86/include/asm/debugreg.h | 67 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 63 ------------------------------------- arch/x86/kernel/cpu/common.c | 1 + 3 files changed, 68 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b903d5ea3941..2d91580bf228 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -78,8 +78,75 @@ */ #ifdef __KERNEL__ +#include + DECLARE_PER_CPU(unsigned long, cpu_dr7); +#ifndef CONFIG_PARAVIRT +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) +#endif + +static inline unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("mov %%db0, %0" :"=r" (val)); + break; + case 1: + asm("mov %%db1, %0" :"=r" (val)); + break; + case 2: + asm("mov %%db2, %0" :"=r" (val)); + break; + case 3: + asm("mov %%db3, %0" :"=r" (val)); + break; + case 6: + asm("mov %%db6, %0" :"=r" (val)); + break; + case 7: + asm("mov %%db7, %0" :"=r" (val)); + break; + default: + BUG(); + } + return val; +} + +static inline void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("mov %0, %%db0" ::"r" (value)); + break; + case 1: + asm("mov %0, %%db1" ::"r" (value)); + break; + case 2: + asm("mov %0, %%db2" ::"r" (value)); + break; + case 3: + asm("mov %0, %%db3" ::"r" (value)); + break; + case 6: + asm("mov %0, %%db6" ::"r" (value)); + break; + case 7: + asm("mov %0, %%db7" ::"r" (value)); + break; + default: + BUG(); + } +} + static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 
58545c97d071..30aa6e95f814 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -474,61 +474,6 @@ struct thread_struct { unsigned io_bitmap_max; }; -static inline unsigned long native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - asm("mov %%db0, %0" :"=r" (val)); - break; - case 1: - asm("mov %%db1, %0" :"=r" (val)); - break; - case 2: - asm("mov %%db2, %0" :"=r" (val)); - break; - case 3: - asm("mov %%db3, %0" :"=r" (val)); - break; - case 6: - asm("mov %%db6, %0" :"=r" (val)); - break; - case 7: - asm("mov %%db7, %0" :"=r" (val)); - break; - default: - BUG(); - } - return val; -} - -static inline void native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - asm("mov %0, %%db0" ::"r" (value)); - break; - case 1: - asm("mov %0, %%db1" ::"r" (value)); - break; - case 2: - asm("mov %0, %%db2" ::"r" (value)); - break; - case 3: - asm("mov %0, %%db3" ::"r" (value)); - break; - case 6: - asm("mov %0, %%db6" ::"r" (value)); - break; - case 7: - asm("mov %0, %%db7" ::"r" (value)); - break; - default: - BUG(); - } -} - /* * Set IOPL bits in EFLAGS from given mask */ @@ -574,14 +519,6 @@ static inline void native_swapgs(void) #define __cpuid native_cpuid #define paravirt_enabled() 0 -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = native_get_debugreg(register) -#define set_debugreg(value, register) \ - native_set_debugreg(register, value) - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c0f7d68d318f..0d676dd923ac 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From b8d43cb504a94f1070159a37c8cb23008276eff3 Mon Sep 17 00:00:00 2001 From:
"H. Peter Anvin" Date: Tue, 28 Feb 2012 23:30:58 -0800 Subject: x86, tools: Remove unneeded header files from tools/build.c We include <sys/sysmacros.h> and <asm/boot.h>, but none of those header files actually provide anything this file needs. Furthermore, it breaks cross-compilation, so just remove them. Reported-by: Stephen Rothwell Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Matt Fleming Cc: Andrew Morton Cc: Nick Bowler Link: http://lkml.kernel.org/r/20120229111322.9eb4b23ff1672e8853ad3b3b@canb.auug.org.au --- arch/x86/boot/tools/build.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index f2ac95ece0cc..f3bd2e676d2a 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -29,11 +29,9 @@ #include #include #include -#include #include #include #include -#include #include typedef unsigned char u8; -- cgit v1.2.3 From a51f4047758d2bcd099ea113b833ed380f4024ba Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 28 Feb 2012 23:36:21 -0800 Subject: x86, build: Fix portability issues when cross-building It would appear that we never actually generated a correct CRC when building on a bigendian machine. Depending on the word size, we would either generate an all-zero CRC (64-bit machine) or a byte-swapped CRC (32-bit machine.) Fix the types used so we don't arbitrarily use a 64-bit word to hold 32-bit numbers, and pass the CRC through put_unaligned_le32() like all the other numbers. Signed-off-by: H.
Peter Anvin Cc: Stephen Rothwell Cc: Matt Fleming Cc: Andrew Morton Cc: Nick Bowler Link: http://lkml.kernel.org/r/20120229111322.9eb4b23ff1672e8853ad3b3b@canb.auug.org.au --- arch/x86/boot/tools/build.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index f3bd2e676d2a..ed549767a231 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -36,7 +36,7 @@ typedef unsigned char u8; typedef unsigned short u16; -typedef unsigned long u32; +typedef unsigned int u32; #define DEFAULT_MAJOR_ROOT 0 #define DEFAULT_MINOR_ROOT 0 @@ -247,8 +247,9 @@ int main(int argc, char ** argv) } /* Write the CRC */ - fprintf(stderr, "CRC %lx\n", crc); - if (fwrite(&crc, 1, 4, stdout) != 4) + fprintf(stderr, "CRC %x\n", crc); + put_unaligned_le32(crc, buf); + if (fwrite(buf, 1, 4, stdout) != 4) die("Writing CRC failed"); close(fd); -- cgit v1.2.3 From 50af5ead3b44ccf8bd2b4d2a50c1b610f557c480 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 20 Jan 2012 18:35:53 -0500 Subject: bug.h: add include of it to various implicit C users With bug.h currently living right in linux/kernel.h there are files that use BUG_ON and friends but are not including the header explicitly. Fix them up so we can remove the presence in kernel.h file. 
Signed-off-by: Paul Gortmaker --- arch/arm/mach-imx/cpu_op-mx51.c | 1 + arch/arm/mach-ux500/board-mop500-pins.c | 1 + arch/mips/fw/arc/cmdline.c | 1 + arch/mips/fw/arc/identify.c | 1 + arch/parisc/math-emu/fpudispatch.c | 1 + arch/powerpc/kernel/pmc.c | 1 + arch/powerpc/xmon/ppc-opc.c | 1 + arch/powerpc/xmon/spu-opc.c | 1 + arch/x86/kernel/paravirt.c | 1 + arch/x86/mm/kmemcheck/selftest.c | 1 + drivers/gpu/drm/radeon/cayman_blit_shaders.c | 1 + drivers/gpu/drm/radeon/evergreen_blit_shaders.c | 1 + drivers/gpu/drm/radeon/r600_blit_shaders.c | 1 + drivers/staging/wlags49_h2/hcf.c | 1 + lib/atomic64_test.c | 1 + lib/bitmap.c | 1 + lib/iommu-helper.c | 1 + lib/list_debug.c | 1 + lib/plist.c | 1 + lib/string.c | 1 + lib/timerqueue.c | 1 + 21 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/arm/mach-imx/cpu_op-mx51.c b/arch/arm/mach-imx/cpu_op-mx51.c index 9d34c3d4c024..7b92cd6da6d3 100644 --- a/arch/arm/mach-imx/cpu_op-mx51.c +++ b/arch/arm/mach-imx/cpu_op-mx51.c @@ -11,6 +11,7 @@ * http://www.gnu.org/copyleft/gpl.html */ +#include #include #include #include diff --git a/arch/arm/mach-ux500/board-mop500-pins.c b/arch/arm/mach-ux500/board-mop500-pins.c index 74bfcff2bdf3..f5413dca532c 100644 --- a/arch/arm/mach-ux500/board-mop500-pins.c +++ b/arch/arm/mach-ux500/board-mop500-pins.c @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/arch/mips/fw/arc/cmdline.c b/arch/mips/fw/arc/cmdline.c index 9fdf07e50f1b..c0122a1dc587 100644 --- a/arch/mips/fw/arc/cmdline.c +++ b/arch/mips/fw/arc/cmdline.c @@ -7,6 +7,7 @@ * * Copyright (C) 1996 David S. Miller (davem@davemloft.net) */ +#include #include #include #include diff --git a/arch/mips/fw/arc/identify.c b/arch/mips/fw/arc/identify.c index 788060a53dce..54a33c756f61 100644 --- a/arch/mips/fw/arc/identify.c +++ b/arch/mips/fw/arc/identify.c @@ -11,6 +11,7 @@ * * Copyright (C) 1996 David S. 
Miller (davem@davemloft.net) */ +#include #include #include #include diff --git a/arch/parisc/math-emu/fpudispatch.c b/arch/parisc/math-emu/fpudispatch.c index 6e28f9f4c620..673b73e8420d 100644 --- a/arch/parisc/math-emu/fpudispatch.c +++ b/arch/parisc/math-emu/fpudispatch.c @@ -50,6 +50,7 @@ #define FPUDEBUG 0 #include "float.h" +#include #include #include /* #include */ diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c index a841a9d136a2..58eaa3ddf7b9 100644 --- a/arch/powerpc/kernel/pmc.c +++ b/arch/powerpc/kernel/pmc.c @@ -13,6 +13,7 @@ */ #include +#include #include #include diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c index af3780e52e76..6845e91ba04a 100644 --- a/arch/powerpc/xmon/ppc-opc.c +++ b/arch/powerpc/xmon/ppc-opc.c @@ -22,6 +22,7 @@ #include #include +#include #include "nonstdio.h" #include "ppc.h" diff --git a/arch/powerpc/xmon/spu-opc.c b/arch/powerpc/xmon/spu-opc.c index 530df3d6d7b2..7d37597c4bcd 100644 --- a/arch/powerpc/xmon/spu-opc.c +++ b/arch/powerpc/xmon/spu-opc.c @@ -19,6 +19,7 @@ 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include +#include #include "spu.h" /* This file holds the Spu opcode table */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..83e7b81d2135 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 036efbea8b28..aef7140c0063 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,3 +1,4 @@ +#include #include #include "opcode.h" diff --git a/drivers/gpu/drm/radeon/cayman_blit_shaders.c b/drivers/gpu/drm/radeon/cayman_blit_shaders.c index 7b4eeb7b4a8c..19a0114d2e3b 100644 --- a/drivers/gpu/drm/radeon/cayman_blit_shaders.c +++ b/drivers/gpu/drm/radeon/cayman_blit_shaders.c @@ -24,6 +24,7 @@ * Alex Deucher */ +#include #include #include diff --git a/drivers/gpu/drm/radeon/evergreen_blit_shaders.c b/drivers/gpu/drm/radeon/evergreen_blit_shaders.c index 3a10399e0066..f85c0af115b5 100644 --- a/drivers/gpu/drm/radeon/evergreen_blit_shaders.c +++ b/drivers/gpu/drm/radeon/evergreen_blit_shaders.c @@ -24,6 +24,7 @@ * Alex Deucher */ +#include #include #include diff --git a/drivers/gpu/drm/radeon/r600_blit_shaders.c b/drivers/gpu/drm/radeon/r600_blit_shaders.c index 2d1f6c5ee2a7..3af3c6426a6e 100644 --- a/drivers/gpu/drm/radeon/r600_blit_shaders.c +++ b/drivers/gpu/drm/radeon/r600_blit_shaders.c @@ -24,6 +24,7 @@ * Alex Deucher */ +#include #include #include diff --git a/drivers/staging/wlags49_h2/hcf.c b/drivers/staging/wlags49_h2/hcf.c index b008773323b3..5957c3a439ac 100644 --- a/drivers/staging/wlags49_h2/hcf.c +++ b/drivers/staging/wlags49_h2/hcf.c @@ -91,6 +91,7 @@ #include "hcf.h" // HCF and MSF common include file #include "hcfdef.h" // HCF specific include file #include "mmd.h" // MoreModularDriver common include file +#include #include #if ! 
defined offsetof diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 0c33cde2a1e6..cb99b91c3a1d 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -9,6 +9,7 @@ * (at your option) any later version. */ #include +#include #include #include diff --git a/lib/bitmap.c b/lib/bitmap.c index 0d4a127dd9b3..90a683b34075 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index da053313ee5c..8b1ab6222562 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -4,6 +4,7 @@ #include #include +#include int iommu_is_span_boundary(unsigned int index, unsigned int nr, unsigned long shift, diff --git a/lib/list_debug.c b/lib/list_debug.c index 7204e619a4c1..1bf2fe36f813 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -8,6 +8,7 @@ #include #include +#include #include /* diff --git a/lib/plist.c b/lib/plist.c index a0a4da489c22..6ab0e521c48b 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -23,6 +23,7 @@ * information. */ +#include #include #include diff --git a/lib/string.c b/lib/string.c index dc4a86341f91..0573a20df9a6 100644 --- a/lib/string.c +++ b/lib/string.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #ifndef __HAVE_ARCH_STRNICMP diff --git a/lib/timerqueue.c b/lib/timerqueue.c index 191176a43e9a..14c640355eb1 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -22,6 +22,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include -- cgit v1.2.3 From bd2f55361f18347e890d52ff9cfd8895455ec11b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 12:33:18 +0100 Subject: sched/rt: Use schedule_preempt_disabled() Coccinelle based conversion. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-24swm5zut3h9c4a6s46x8rws@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/process.c | 4 +--- arch/avr32/kernel/process.c | 4 +--- arch/blackfin/kernel/process.c | 4 +--- arch/cris/kernel/process.c | 4 +--- arch/frv/kernel/process.c | 4 +--- arch/h8300/kernel/process.c | 4 +--- arch/ia64/kernel/process.c | 4 +--- arch/m32r/kernel/process.c | 4 +--- arch/m68k/kernel/process_mm.c | 4 +--- arch/m68k/kernel/process_no.c | 4 +--- arch/microblaze/kernel/process.c | 4 +--- arch/mips/kernel/process.c | 4 +--- arch/mn10300/kernel/process.c | 4 +--- arch/parisc/kernel/process.c | 4 +--- arch/powerpc/kernel/idle.c | 8 ++++---- arch/powerpc/platforms/iseries/setup.c | 8 ++------ arch/s390/kernel/process.c | 4 +--- arch/score/kernel/process.c | 4 +--- arch/sh/kernel/idle.c | 4 +--- arch/sparc/kernel/process_32.c | 8 ++------ arch/sparc/kernel/process_64.c | 10 ++++------ arch/tile/kernel/process.c | 4 +--- arch/x86/kernel/process_32.c | 4 +--- arch/x86/kernel/process_64.c | 4 +--- arch/xtensa/kernel/process.c | 4 +--- init/main.c | 5 +---- kernel/mutex.c | 4 +--- kernel/softirq.c | 4 +--- 28 files changed, 36 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 971d65c253a9..c2ae3cd331fe 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -239,9 +239,7 @@ void cpu_idle(void) leds_event(led_idle_end); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index ea3395750324..92c5af98a6f7 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -40,9 +40,7 @@ void cpu_idle(void) cpu_idle_sleep(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + 
schedule_preempt_disabled(); } } diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 8dd0416673cb..a80a643f3691 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -94,9 +94,7 @@ void cpu_idle(void) idle(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index aa585e4e979e..d8f50ff6fadd 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -115,9 +115,7 @@ void cpu_idle (void) idle = default_idle; idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 3901df1213c0..29cc49783787 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -92,9 +92,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 933bd388efb2..1a173b35f475 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -81,9 +81,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 6d33c5cc94f0..9dc52b63fc87 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -330,9 +330,7 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); if (cpu_is_offline(cpu)) play_dead(); diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 422bea9f1dbc..3a4a32b27208 100644 --- a/arch/m32r/kernel/process.c +++ 
b/arch/m32r/kernel/process.c @@ -90,9 +90,7 @@ void cpu_idle (void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_mm.c b/arch/m68k/kernel/process_mm.c index 099283ee1a8f..fe4186b5fc32 100644 --- a/arch/m68k/kernel/process_mm.c +++ b/arch/m68k/kernel/process_mm.c @@ -78,9 +78,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_no.c b/arch/m68k/kernel/process_no.c index 5e1078cabe0e..f7fe6c348595 100644 --- a/arch/m68k/kernel/process_no.c +++ b/arch/m68k/kernel/process_no.c @@ -73,9 +73,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 7dcb5bfffb75..9155f7d92669 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -110,9 +110,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 7955409051c4..61f1cb45a1d5 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -80,9 +80,7 @@ void __noreturn cpu_idle(void) #endif rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 28eec3102535..cac401d37f75 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -123,9 +123,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - 
preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 62c60b87d039..d4b94b395c16 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -71,9 +71,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 0a48bf5db6c8..65035141552b 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -101,11 +101,11 @@ void cpu_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - if (cpu_should_die()) + if (cpu_should_die()) { + preempt_enable_no_resched(); cpu_die(); - schedule(); - preempt_disable(); + } + schedule_preempt_disabled(); } } diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index 8fc62586a973..a5fbf4cb6329 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -584,9 +584,7 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } @@ -615,9 +613,7 @@ static void iseries_dedicated_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e795933eb2cb..7618085b4164 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -97,9 +97,7 @@ void cpu_idle(void) tick_nohz_idle_exit(); if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git 
a/arch/score/kernel/process.c b/arch/score/kernel/process.c index 25d08030a883..2707023c7563 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -53,9 +53,7 @@ void __noreturn cpu_idle(void) while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 406508d4ce74..7e4892826563 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -114,9 +114,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f793742eec2b..935fdbcd88c2 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } @@ -138,9 +136,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 39d8b05201a2..ab9a29268213 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -104,15 +104,13 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) { + preempt_enable_no_resched(); cpu_play_dead(); + } #endif - - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 4c1ac6e5347a..6ae495ef2b99 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -108,9 +108,7 @@ void 
cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c08d1ff12b7c..49888fefe794 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -119,9 +119,7 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cfa5c90c01db..e34257c70c28 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -156,9 +156,7 @@ void cpu_idle(void) } tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 47041e7c088c..2c9004770c4e 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) platform_idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/init/main.c b/init/main.c index ff49a6dacfbb..4990f7ec776a 100644 --- a/init/main.c +++ b/init/main.c @@ -374,11 +374,8 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); - schedule(); - + schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ - preempt_disable(); cpu_idle(); } diff --git a/kernel/mutex.c b/kernel/mutex.c index 89096dd8786f..a307cc9c9526 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - 
preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..79b524767a24 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -744,9 +744,7 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From a97f4f5e524bcd09a85ef0b8821a14d35e69335f Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Tue, 28 Feb 2012 15:31:35 -0600 Subject: x86/PCI: do not tie MSI MS-7253 use_crs quirk to BIOS version Carlos was getting WARNING: at drivers/pci/pci.c:118 pci_ioremap_bar+0x24/0x52() when probing his sound card, and sound did not work. After adding pci=use_crs to the kernel command line, no more trouble. Ok, we can add a quirk. dmidecode output reveals that this is an MSI MS-7253, for which we already have a quirk, but the short-sighted author tied the quirk to a single BIOS version, making it not kick in on Carlos's machine with BIOS V1.2. If a later BIOS update makes it no longer necessary to look at the _CRS info it will still be harmless, so let's stop trying to guess which versions have and don't have accurate _CRS tables. Addresses https://bugtrack.alsa-project.org/alsa-bug/view.php?id=5533 Also see . 
Reported-by: Carlos Luna Reviewed-by: Bjorn Helgaas Signed-off-by: Jonathan Nieder Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 7034c081b226..49a5cb55429b 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -68,7 +68,6 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"), DMI_MATCH(DMI_BOARD_NAME, "MS-7253"), DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "V1.6"), }, }, -- cgit v1.2.3 From b263b31e8ad65cdbfa5a7f739460f350554a2dc1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 27 Feb 2012 15:15:25 -0800 Subject: x86, mtrr: Use explicit sizing and padding for the 64-bit ioctls Specify the data structures for the 64-bit ioctls with explicit sizing and padding so that the x32 kernel will correctly use the 64-bit forms of these ioctls. Note that these ioctls are bogus in both forms on both 32 and 64 bits; even on 64 bits the maximum MTRR size is only 44 bits long. Note that nothing really is supposed to use these ioctls and that the preferred interface is text strings on /proc/mtrr, or better yet, nothing at all (use /sys/bus/pci/devices/*/resource*_wc for write combining; that uses PAT not MTRRs.) Signed-off-by: H. Peter Anvin Cc: H. J. Lu Tested-by: Nitin A. Kamble Link: http://lkml.kernel.org/n/tip-vwvnlu3hjmtkwvij4qxtm90l@git.kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mtrr.h | 28 ++++++++++++++++++---------- arch/x86/kernel/cpu/mtrr/if.c | 10 ++++++---- 2 files changed, 24 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 4365ffdb461f..7e3f17f92c66 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -29,18 +29,18 @@ #define MTRR_IOCTL_BASE 'M' -struct mtrr_sentry { - unsigned long base; /* Base address */ - unsigned int size; /* Size of region */ - unsigned int type; /* Type of region */ -}; - /* Warning: this structure has a different order from i386 on x86-64. The 32bit emulation code takes care of that. But you need to use this for 64bit, otherwise your X server will break. */ #ifdef __i386__ +struct mtrr_sentry { + unsigned long base; /* Base address */ + unsigned int size; /* Size of region */ + unsigned int type; /* Type of region */ +}; + struct mtrr_gentry { unsigned int regnum; /* Register number */ unsigned long base; /* Base address */ @@ -50,12 +50,20 @@ struct mtrr_gentry { #else /* __i386__ */ +struct mtrr_sentry { + __u64 base; /* Base address */ + __u32 size; /* Size of region */ + __u32 type; /* Type of region */ +}; + struct mtrr_gentry { - unsigned long base; /* Base address */ - unsigned int size; /* Size of region */ - unsigned int regnum; /* Register number */ - unsigned int type; /* Type of region */ + __u64 base; /* Base address */ + __u32 size; /* Size of region */ + __u32 regnum; /* Register number */ + __u32 type; /* Type of region */ + __u32 _pad; /* Unused */ }; + #endif /* !__i386__ */ struct mtrr_var_range { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 79289632cb27..a041e094b8b9 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -167,6 +167,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) { int err = 0; mtrr_type type; + unsigned long base; unsigned long size; 
struct mtrr_sentry sentry; struct mtrr_gentry gentry; @@ -267,14 +268,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that go above 4GB */ - if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) gentry.base = gentry.size = gentry.type = 0; else { - gentry.base <<= PAGE_SHIFT; + gentry.base = base << PAGE_SHIFT; gentry.size = size << PAGE_SHIFT; gentry.type = type; } @@ -321,11 +322,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that would overflow */ if (size != (__typeof__(gentry.size))size) gentry.base = gentry.size = gentry.type = 0; else { + gentry.base = base; gentry.size = size; gentry.type = type; } -- cgit v1.2.3 From 1018faa6cf23b256bf25919ef203cd7c129f06f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Feb 2012 14:57:32 +0100 Subject: perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled It turned out that a performance counter on AMD does not count at all when the GO or HO bit is set in the control register and SVM is disabled in EFER. This patch works around this issue by masking out the HO bit in the performance counter control register when SVM is not enabled. The GO bit is not touched because it is only set when the user wants to count in guest-mode only. So when SVM is disabled the counter should not run at all and the not-counting is the intended behaviour. 
Signed-off-by: Joerg Roedel Signed-off-by: Peter Zijlstra Cc: Avi Kivity Cc: Stephane Eranian Cc: David Ahern Cc: Gleb Natapov Cc: Robert Richter Cc: stable@vger.kernel.org # v3.2 Link: http://lkml.kernel.org/r/1330523852-19566-1-git-send-email-joerg.roedel@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 8 ++++++++ arch/x86/kernel/cpu/perf_event.h | 8 ++++++-- arch/x86/kernel/cpu/perf_event_amd.c | 37 ++++++++++++++++++++++++++++++++++-- arch/x86/kvm/svm.c | 5 +++++ 4 files changed, 54 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..461ce432b1c2 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -242,4 +242,12 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) static inline void perf_events_lapic_init(void) { } #endif +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + extern void amd_pmu_enable_virt(void); + extern void amd_pmu_disable_virt(void); +#else + static inline void amd_pmu_enable_virt(void) { } + static inline void amd_pmu_disable_virt(void) { } +#endif + #endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..c30c807ddc72 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -147,7 +147,9 @@ struct cpu_hw_events { /* * AMD specific bits */ - struct amd_nb *amd_nb; + struct amd_nb *amd_nb; + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ + u64 perf_ctr_virt_mask; void *kfree_on_online; }; @@ -417,9 +419,11 @@ void x86_pmu_disable_all(void); static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); + if (hwc->extra_reg.reg) wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); - wrmsrl(hwc->config_base, 
hwc->config | enable_mask); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); } void x86_pmu_enable_all(int added); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 0397b23be8e9..67250a52430b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -357,7 +358,9 @@ static void amd_pmu_cpu_starting(int cpu) struct amd_nb *nb; int i, nb_id; - if (boot_cpu_data.x86_max_cores < 2) + cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + + if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) return; nb_id = amd_get_nb_id(cpu); @@ -587,9 +590,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .put_event_constraints = amd_put_event_constraints, .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, .cpu_dead = amd_pmu_cpu_dead, #endif + .cpu_starting = amd_pmu_cpu_starting, }; __init int amd_pmu_init(void) @@ -621,3 +624,33 @@ __init int amd_pmu_init(void) return 0; } + +void amd_pmu_enable_virt(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + cpuc->perf_ctr_virt_mask = 0; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + /* + * We only mask out the Host-only bit so that host-only counting works + * when SVM is disabled. If someone sets up a guest-only counter when + * SVM is disabled the Guest-only bits still gets set and the counter + * will not count anything. 
+ */ + cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fa553babe56..e385214711cb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -575,6 +576,8 @@ static void svm_hardware_disable(void *garbage) wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); cpu_svm_disable(); + + amd_pmu_disable_virt(); } static int svm_hardware_enable(void *garbage) @@ -622,6 +625,8 @@ static int svm_hardware_enable(void *garbage) svm_init_erratum_383(); + amd_pmu_enable_virt(); + return 0; } -- cgit v1.2.3 From 63ab387ca0d1576edef35ef68e4b8ea5e0757b7a Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Fri, 2 Mar 2012 12:45:01 -0700 Subject: x86/PCI: add spinlock held check to 'pcibios_fwaddrmap_lookup()' 'pcibios_fwaddrmap_lookup()' is used to maintain FW-assigned BIOS BAR values for reinstatement when normal resource assignment attempts fail and must be called with the 'pcibios_fwaddrmap_lock' spinlock held. This patch adds a WARN_ON notification if the spinlock is not currently held by the caller. 
Signed-off-by: Myron Stowe Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 33e6a0b995fc..831971e731f7 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -57,6 +57,8 @@ static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev) { struct pcibios_fwaddrmap *map; + WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock)); + list_for_each_entry(map, &pcibios_fwaddrmappings, list) if (map->dev == dev) return map; -- cgit v1.2.3 From e37aade31601cdb9f078f6663cbf887f391bb110 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 28 Feb 2012 16:16:33 +0100 Subject: x86, memblock: Move mem_hole_size() to .init mem_hole_size() is being called only from __init-marked functions, and as such should be moved to .init section as well. Fixes this warning: WARNING: vmlinux.o(.text+0x35511): Section mismatch in reference from the function mem_hole_size() to the function .init.text:absent_pages_in_range() Signed-off-by: Jiri Kosina Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1202281614450.31150@pobox.suse.cz Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_emulation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 46db56845f18..2fff6518e302 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -28,7 +28,7 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) return -ENOENT; } -static u64 mem_hole_size(u64 start, u64 end) +static u64 __init mem_hole_size(u64 start, u64 end) { unsigned long start_pfn = PFN_UP(start); unsigned long end_pfn = PFN_DOWN(end); -- cgit v1.2.3 From 187f1882b5b0748b3c4c22274663fdb372ac0452 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 23 Nov 2011 20:12:59 -0500 Subject: BUG: headers with BUG/BUG_ON etc. 
need linux/bug.h If a header file is making use of BUG, BUG_ON, BUILD_BUG_ON, or any other BUG variant in a static inline (i.e. not in a #define) then that header really should be including <linux/bug.h> and not just expecting it to be implicitly present. We can make this change risk-free, since if the files using these headers didn't have exposure to linux/bug.h already, they would have been causing compile failures/warnings. Signed-off-by: Paul Gortmaker --- arch/avr32/include/asm/io.h | 1 + arch/m68k/include/asm/system.h | 1 + arch/sparc/include/asm/vga.h | 1 + arch/x86/include/asm/paravirt.h | 1 + include/asm-generic/dma-mapping-common.h | 1 + include/asm-generic/pgtable.h | 1 + include/asm-generic/tlbflush.h | 2 ++ include/drm/ttm/ttm_memory.h | 1 + include/linux/atmdev.h | 1 + include/linux/bio.h | 1 + include/linux/bit_spinlock.h | 1 + include/linux/ceph/decode.h | 3 ++- include/linux/ceph/libceph.h | 1 + include/linux/ceph/mdsmap.h | 1 + include/linux/cpumask.h | 1 + include/linux/crypto.h | 1 + include/linux/debug_locks.h | 1 + include/linux/dmaengine.h | 1 + include/linux/elfcore.h | 1 + include/linux/ext3_fs.h | 1 + include/linux/fs.h | 1 + include/linux/fsnotify.h | 1 + include/linux/gpio.h | 1 + include/linux/highmem.h | 1 + include/linux/i2o.h | 1 + include/linux/if_vlan.h | 1 + include/linux/io-mapping.h | 1 + include/linux/kprobes.h | 1 + include/linux/kvm_host.h | 1 + include/linux/memory_hotplug.h | 1 + include/linux/mm.h | 1 + include/linux/mtd/cfi.h | 1 + include/linux/netdevice.h | 1 + include/linux/nilfs2_fs.h | 1 + include/linux/page-flags.h | 1 + include/linux/pid_namespace.h | 1 + include/linux/posix_acl.h | 1 + include/linux/ptrace.h | 1 + include/linux/radix-tree.h | 1 + include/linux/rcupdate.h | 1 + include/linux/regset.h | 1 + include/linux/reiserfs_fs.h | 1 + include/linux/relay.h | 1 + include/linux/scatterlist.h | 6 ++++-- include/linux/seq_file.h | 1 + include/linux/skbuff.h | 1 + include/linux/slub_def.h | 1 +
include/linux/ssb/ssb_driver_gige.h | 1 + include/linux/swapops.h | 1 + include/linux/syscalls.h | 1 + include/linux/transport_class.h | 1 + include/linux/virtio_config.h | 1 + include/net/cfg80211.h | 1 + include/net/dst.h | 1 + include/net/ip_vs.h | 1 + include/net/mac80211.h | 1 + include/net/netns/generic.h | 1 + include/net/red.h | 1 + include/net/tcp.h | 1 + include/net/timewait_sock.h | 1 + include/net/udp.h | 1 + include/net/wpan-phy.h | 1 + include/scsi/osd_ore.h | 1 + include/scsi/scsi_transport.h | 1 + 64 files changed, 69 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/avr32/include/asm/io.h b/arch/avr32/include/asm/io.h index 22c97ef92201..cf60d0a9f176 100644 --- a/arch/avr32/include/asm/io.h +++ b/arch/avr32/include/asm/io.h @@ -1,6 +1,7 @@ #ifndef __ASM_AVR32_IO_H #define __ASM_AVR32_IO_H +#include #include #include #include diff --git a/arch/m68k/include/asm/system.h b/arch/m68k/include/asm/system.h index 47b01f4726bc..8dc68178716c 100644 --- a/arch/m68k/include/asm/system.h +++ b/arch/m68k/include/asm/system.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h index c69d5b2ba19a..ec0e9967d93d 100644 --- a/arch/sparc/include/asm/vga.h +++ b/arch/sparc/include/asm/vga.h @@ -7,6 +7,7 @@ #ifndef _LINUX_ASM_VGA_H_ #define _LINUX_ASM_VGA_H_ +#include #include #define VT_BUF_HAVE_RW diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74fb..923b07024a03 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -10,6 +10,7 @@ #include #ifndef __ASSEMBLY__ +#include #include #include diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h index 9fa3f96e38cf..2e248d8924dc 100644 --- a/include/asm-generic/dma-mapping-common.h +++ b/include/asm-generic/dma-mapping-common.h @@ -2,6 +2,7 @@ #define _ASM_GENERIC_DMA_MAPPING_H #include 
+#include #include #include #include diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 76bff2bff15e..236b1056839f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -5,6 +5,7 @@ #ifdef CONFIG_MMU #include +#include #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, diff --git a/include/asm-generic/tlbflush.h b/include/asm-generic/tlbflush.h index c7af037024c7..d6d0a88430fe 100644 --- a/include/asm-generic/tlbflush.h +++ b/include/asm-generic/tlbflush.h @@ -9,6 +9,8 @@ #error need to implement an architecture specific asm/tlbflush.h #endif +#include + static inline void flush_tlb_mm(struct mm_struct *mm) { BUG(); diff --git a/include/drm/ttm/ttm_memory.h b/include/drm/ttm/ttm_memory.h index 26c1f78d136f..d6d1da468c97 100644 --- a/include/drm/ttm/ttm_memory.h +++ b/include/drm/ttm/ttm_memory.h @@ -30,6 +30,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index f4ff882cb2da..42c471afc52a 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -217,6 +217,7 @@ struct atm_cirange { #include /* wait_queue_head_t */ #include /* struct timeval */ #include +#include #include /* struct sk_buff */ #include #include diff --git a/include/linux/bio.h b/include/linux/bio.h index 129a9c097958..f54db088f335 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index ac4d9f8b52e9..3b5bafce4337 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -4,6 +4,7 @@ #include #include #include +#include /* * bit-based spin_lock() diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index c5b6939fb32a..220ae21e819b 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -1,8 +1,9 @@ 
#ifndef __CEPH_DECODE_H #define __CEPH_DECODE_H -#include +#include #include +#include #include "types.h" diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 95bd8502e715..e8cf0ccd1a8d 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 4c5cb0880bba..9935fac8c107 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -1,6 +1,7 @@ #ifndef _FS_CEPH_MDSMAP_H #define _FS_CEPH_MDSMAP_H +#include #include "types.h" /* diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 4f7a63237471..7b9b75a529be 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -9,6 +9,7 @@ #include #include #include +#include typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 8a94217b298e..d870bae81df1 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 5033fb88c107..94f20c1488a1 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -3,6 +3,7 @@ #include #include +#include #include struct task_struct; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 679b349d9b66..a5966f691ef8 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 394a3e0e4a6b..0698c79fbcb2 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -6,6 +6,7 @@ #include #ifdef __KERNEL__ #include +#include #endif #include #include diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 
f957085d40ed..f5a84eef6ed2 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -18,6 +18,7 @@ #include #include +#include /* * The second extended filesystem constants/structures diff --git a/include/linux/fs.h b/include/linux/fs.h index 69cd5bb640f5..abc92db51e54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -389,6 +389,7 @@ struct inodes_stat_t { #include #include #include +#include #include #include #include diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 2a53f10712b3..a6dfe6944564 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * fsnotify_d_instantiate - instantiate a dentry for inode diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 38ac48b7d3a8..ed5a46707ad0 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -34,6 +34,7 @@ struct gpio { #include #include #include +#include struct device; struct gpio_chip; diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3a93f73a8acc..6ede661e5b8e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/i2o.h b/include/linux/i2o.h index a6deef4f4f67..d23c3c20b201 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -24,6 +24,7 @@ #define I2O_MAX_DRIVERS 8 #include +#include #include #include #include diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 13aff1e2183b..82097f39df10 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -17,6 +17,7 @@ #include #include #include +#include #define VLAN_HLEN 4 /* The additional bytes (on top of the Ethernet header) * that VLAN requires. 
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index e44e84f0156c..657fab4efab3 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -20,6 +20,7 @@ #include #include +#include #include #include diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index dce6e4dbeda7..b6e1f8c00577 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 900c76337e8f..ca1b153585d3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 0b8e2a742600..910550f3b70e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -4,6 +4,7 @@ #include #include #include +#include struct page; struct zone; diff --git a/include/linux/mm.h b/include/linux/mm.h index 17b27cd269c4..b7fac5b6acb6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -6,6 +6,7 @@ #ifdef __KERNEL__ #include +#include #include #include #include diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index d5d2ec6494bb..37ef6b194089 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c95255..5820638193f5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -33,6 +33,7 @@ #ifdef __KERNEL__ #include #include +#include #include #include #include diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 7454ad7451b4..89bd4a4dcfb4 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -41,6 +41,7 @@ #include #include #include +#include #define NILFS_INODE_BMAP_SIZE 7 diff --git 
a/include/linux/page-flags.h b/include/linux/page-flags.h index e90a673be67e..3cfa3ad94b1f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -6,6 +6,7 @@ #define PAGE_FLAGS_H #include +#include #ifndef __GENERATING_BOUNDS_H #include #include diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index e7cf6669ac34..f5bd679be46b 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -2,6 +2,7 @@ #define _LINUX_PID_NS_H #include +#include #include #include #include diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index b7681102a4b9..11bad91c4433 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H +#include #include #include diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index c2f1f6a5fcb8..753ee8b62335 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -113,6 +113,7 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ #include /* for IS_ERR_VALUE */ +#include /* For BUG_ON. 
*/ extern long arch_ptrace(struct task_struct *child, long request, diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 07e360b1b282..e9a48234e693 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -22,6 +22,7 @@ #include #include +#include #include #include diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 81c04f4348ec..3b657f2bed4a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #ifdef CONFIG_RCU_TORTURE_TEST diff --git a/include/linux/regset.h b/include/linux/regset.h index 8abee6556223..6325e099105a 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -15,6 +15,7 @@ #include #include +#include #include struct task_struct; struct user_regset; diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 2213ddcce20c..6643fb031293 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/relay.h b/include/linux/relay.h index a822fd71fd64..91cacc34c159 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 9aaf5bfdad1a..ac9586dadfa5 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -1,10 +1,12 @@ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H +#include +#include +#include + #include #include -#include -#include #include struct sg_table { diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 44f1514b00ba..5ff2df6c8217 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 
50db9b04a552..773ae985ec76 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a32bcfdc7834..ca122b36aec1 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -8,6 +8,7 @@ */ #include #include +#include #include #include diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h index eba52a100533..6b05dcd927ff 100644 --- a/include/linux/ssb/ssb_driver_gige.h +++ b/include/linux/ssb/ssb_driver_gige.h @@ -2,6 +2,7 @@ #define LINUX_SSB_DRIVER_GIGE_H_ #include +#include #include #include diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 2189d3ffc85d..792d16d9cbc7 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -2,6 +2,7 @@ #define _LINUX_SWAPOPS_H #include +#include /* * swapcache pages are stored in the swapper_space radix tree. We want to diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8ec1153ff57b..3de3acb84a95 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -68,6 +68,7 @@ struct file_handle; #include #include #include +#include #include #include #include diff --git a/include/linux/transport_class.h b/include/linux/transport_class.h index 9ae8da3e6407..11087cdd4ad3 100644 --- a/include/linux/transport_class.h +++ b/include/linux/transport_class.h @@ -10,6 +10,7 @@ #define _TRANSPORT_CLASS_H_ #include +#include #include struct transport_container; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 5206d6541da5..7323a3390206 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -53,6 +53,7 @@ #ifdef __KERNEL__ #include +#include #include /** diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a067d30ce73e..85b44ca54ac6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -13,6 +13,7 @@ #include 
#include #include +#include #include #include #include diff --git a/include/net/dst.h b/include/net/dst.h index 344c8dd02874..59c5d18cc385 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ebe517f2da9f..2bdee51ba30d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -16,6 +16,7 @@ #include /* for struct atomic_t */ #include #include +#include #include #include /* for union nf_inet_addr */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d49928ba5d09..8294f44c425a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -13,6 +13,7 @@ #ifndef MAC80211_H #define MAC80211_H +#include #include #include #include diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index d55f43443335..0931618c0f7f 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -5,6 +5,7 @@ #ifndef __NET_GENERIC_H__ #define __NET_GENERIC_H__ +#include #include /* diff --git a/include/net/red.h b/include/net/red.h index 28068ec614b2..77d4c3745cb5 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -2,6 +2,7 @@ #define __NET_SCHED_RED_H #include +#include #include #include #include diff --git a/include/net/tcp.h b/include/net/tcp.h index 42c29bfbcee3..ad8d0a865551 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 053b3cf2c66a..8d6689cb2c66 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -12,6 +12,7 @@ #define _TIMEWAIT_SOCK_H #include +#include #include struct timewait_sock_ops { diff --git a/include/net/udp.h b/include/net/udp.h index e39592f682c3..5d606d9da9e5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -23,6 +23,7 @@ #define _UDP_H #include +#include #include #include 
#include diff --git a/include/net/wpan-phy.h b/include/net/wpan-phy.h index d86fffd3c03c..ff27f1b078d1 100644 --- a/include/net/wpan-phy.h +++ b/include/net/wpan-phy.h @@ -23,6 +23,7 @@ #include #include +#include struct wpan_phy { struct mutex pib_lock; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index f05fa826f89e..a5f9b960dfc8 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -26,6 +26,7 @@ #include #include #include +#include struct ore_comp { struct osd_obj_id obj; diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h index 0de32cd4e8a7..af244f4bba53 100644 --- a/include/scsi/scsi_transport.h +++ b/include/scsi/scsi_transport.h @@ -22,6 +22,7 @@ #include #include +#include #include #include -- cgit v1.2.3 From 901b04450a0ff44d579158b8b0492ce7e66cd442 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 3 Mar 2012 19:27:27 +0800 Subject: x86/numa: Improve internode cache alignment Currently cache alignment among nodes in the kernel is still 128 bytes on x86 NUMA machines - we got that X86_INTERNODE_CACHE_SHIFT default from old P4 processors. But now most modern x86 CPUs use the same size: 64 bytes from L1 to last level L3. so let's remove the incorrect setting, and directly use the L1 cache size to do SMP cache line alignment. This patch saves some memory space on kernel data, and it also improves the cache locality of kernel data. The System.map is quite different with/without this change: before patch after patch ... 000000000000b000 d tlb_vector_| 000000000000b000 d tlb_vector 000000000000b080 d cpu_loops_p| 000000000000b040 d cpu_loops_ ... 
Signed-off-by: Alex Shi Cc: asit.k.mallick@intel.com Link: http://lkml.kernel.org/r/1330774047-18597-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 3c57033e2211..6443c6f038e8 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -303,7 +303,6 @@ config X86_GENERIC config X86_INTERNODE_CACHE_SHIFT int default "12" if X86_VSMP - default "7" if NUMA default X86_L1_CACHE_SHIFT config X86_CMPXCHG -- cgit v1.2.3 From 37178b8bf00137dbf28a9b291af4fbc1b8f91dcc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 29 Nov 2011 14:02:45 +0900 Subject: KVM: MMU: Remove for_each_unsync_children() macro There is only one user of it and for_each_set_bit() does the same. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 224b02c3cda9..8a9b27cb4449 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1391,11 +1391,6 @@ struct kvm_mmu_pages { unsigned int nr; }; -#define for_each_unsync_children(bitmap, idx) \ - for (idx = find_first_bit(bitmap, 512); \ - idx < 512; \ - idx = find_next_bit(bitmap, 512, idx+1)) - static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { @@ -1417,7 +1412,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, { int i, ret, nr_unsync_leaf = 0; - for_each_unsync_children(sp->unsync_child_bitmap, i) { + for_each_set_bit(i, sp->unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; -- cgit v1.2.3 From 6addd1aa2ca28c054820ef2966ad372f118c3f31 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 29 Nov 2011 14:03:36 +0900 Subject: KVM: MMU: Add missing large page accounting to drop_large_spte() Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi 
Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8a9b27cb4449..9270e0d93c31 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1798,6 +1798,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (is_large_pte(*sptep)) { drop_spte(vcpu->kvm, sptep); + --vcpu->kvm->stat.lpages; kvm_flush_remote_tlbs(vcpu->kvm); } } -- cgit v1.2.3 From a138fe7535c0ec778465c7b54b1aaaf4cfd885b7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Dec 2011 18:18:10 +0800 Subject: KVM: MMU: remove the redundant get_written_sptes get_written_sptes is called twice in kvm_mmu_pte_write, one of them can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9270e0d93c31..34da43086952 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3551,7 +3551,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. 
*/ -static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) +static bool detect_write_flooding(struct kvm_mmu_page *sp) { /* * Skip write-flooding detected for the sp whose level is 1, because @@ -3660,10 +3660,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - spte = get_written_sptes(sp, gpa, &npte); - if (detect_write_misaligned(sp, gpa, bytes) || - detect_write_flooding(sp, spte)) { + detect_write_flooding(sp)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; -- cgit v1.2.3 From e08b96371625aaa84cb03f51acc4c8e0be27403a Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:20 +0100 Subject: KVM: s390: add parameter for KVM_CREATE_VM This patch introduces a new config option for user controlled kernel virtual machines. It introduces a parameter to KVM_CREATE_VM that allows to set bits that alter the capabilities of the newly created virtual machine. The parameter is passed to kvm_arch_init_vm for all architectures. The only valid modifier bit for now is KVM_VM_S390_UCONTROL. This requires CAP_SYS_ADMIN privileges and creates a user controlled virtual machine on s390 architectures. 
Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 7 ++++++- arch/ia64/kvm/kvm-ia64.c | 5 ++++- arch/powerpc/kvm/powerpc.c | 5 ++++- arch/s390/kvm/Kconfig | 9 +++++++++ arch/s390/kvm/kvm-s390.c | 24 +++++++++++++++++++----- arch/s390/kvm/kvm-s390.h | 10 ++++++++++ arch/x86/kvm/x86.c | 5 ++++- include/linux/kvm.h | 3 +++ include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 13 +++++-------- 10 files changed, 65 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e1d94bf4056e..579d40b26a5a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -95,7 +95,7 @@ described as 'basic' will be available. Capability: basic Architectures: all Type: system ioctl -Parameters: none +Parameters: machine type identifier (KVM_VM_*) Returns: a VM fd that can be used to control the new virtual machine. The new VM has no virtual cpus and no memory. An mmap() of a VM fd @@ -103,6 +103,11 @@ will access the virtual machine's physical address space; offset zero corresponds to guest physical address zero. Use of mmap() on a VM fd is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is available. +You most certainly want to use 0 as machine type. + +In order to create user controlled virtual machines on S390, check +KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as +privileged user (CAP_SYS_ADMIN). 
4.3 KVM_GET_MSR_INDEX_LIST diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 405052002493..df6b14194051 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -809,10 +809,13 @@ static void kvm_build_io_pmt(struct kvm *kvm) #define GUEST_PHYSICAL_RR4 0x2739 #define VMM_INIT_RR 0x1660 -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { BUG_ON(!kvm); + if (type) + return -EINVAL; + kvm->arch.is_sn2 = ia64_platform_is("sn2"); kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 607fbdf24b84..83f244569874 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -171,8 +171,11 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = kvmppc_core_check_processor_compat(); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + return kvmppc_core_init_vm(kvm); } diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index a21634173a66..78eb9847008f 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -34,6 +34,15 @@ config KVM If unsure, say N. +config KVM_S390_UCONTROL + bool "Userspace controlled virtual machines" + depends on KVM + ---help--- + Allow CAP_SYS_ADMIN users to create KVM virtual machines that are + controlled by userspace. + + If unsure, say N. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. 
source drivers/vhost/Kconfig diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d1c445732451..f0937552175b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -171,11 +171,22 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int rc; char debug_name[16]; + rc = -EINVAL; +#ifdef CONFIG_KVM_S390_UCONTROL + if (type & ~KVM_VM_S390_UCONTROL) + goto out_err; + if ((type & KVM_VM_S390_UCONTROL) && (!capable(CAP_SYS_ADMIN))) + goto out_err; +#else + if (type) + goto out_err; +#endif + rc = s390_enable_sie(); if (rc) goto out_err; @@ -198,10 +209,13 @@ int kvm_arch_init_vm(struct kvm *kvm) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "%s", "vm created"); - kvm->arch.gmap = gmap_alloc(current->mm); - if (!kvm->arch.gmap) - goto out_nogmap; - + if (type & KVM_VM_S390_UCONTROL) { + kvm->arch.gmap = NULL; + } else { + kvm->arch.gmap = gmap_alloc(current->mm); + if (!kvm->arch.gmap) + goto out_nogmap; + } return 0; out_nogmap: debug_unregister(kvm->arch.dbf); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 99b0b7597115..45b236a7c730 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -47,6 +47,16 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu) return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOP_INT; } +static inline int kvm_is_ucontrol(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_S390_UCONTROL + if (kvm->arch.gmap) + return 0; + return 1; +#else + return 0; +#endif +} int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); void kvm_s390_tasklet(unsigned long parm); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc0698118..06925b4bcc27 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6031,8 +6031,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 
free_page((unsigned long)vcpu->arch.pio_data); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 68e67e50d028..bba393a6760f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -431,6 +431,9 @@ struct kvm_ppc_pvinfo { #define KVMIO 0xAE +/* machine type bits, to be used as argument to KVM_CREATE_VM */ +#define KVM_VM_S390_UCONTROL 1 + /* * ioctls for /dev/kvm fds: */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 900c76337e8f..82375e145e64 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -520,7 +520,7 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif -int kvm_arch_init_vm(struct kvm *kvm); +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a91f980077d8..32e3b048a6cf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -449,7 +449,7 @@ static void kvm_init_memslots_id(struct kvm *kvm) slots->id_to_index[i] = slots->memslots[i].id = i; } -static struct kvm *kvm_create_vm(void) +static struct kvm *kvm_create_vm(unsigned long type) { int r, i; struct kvm *kvm = kvm_arch_alloc_vm(); @@ -457,7 +457,7 @@ static struct kvm *kvm_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); - r = kvm_arch_init_vm(kvm); + r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_nodisable; @@ -2198,12 +2198,12 @@ static struct file_operations kvm_vm_fops = { .llseek = noop_llseek, }; -static int kvm_dev_ioctl_create_vm(void) +static int kvm_dev_ioctl_create_vm(unsigned long type) { int r; struct kvm *kvm; - kvm = kvm_create_vm(); + kvm = kvm_create_vm(type); if (IS_ERR(kvm)) 
return PTR_ERR(kvm); #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET @@ -2254,10 +2254,7 @@ static long kvm_dev_ioctl(struct file *filp, r = KVM_API_VERSION; break; case KVM_CREATE_VM: - r = -EINVAL; - if (arg) - goto out; - r = kvm_dev_ioctl_create_vm(); + r = kvm_dev_ioctl_create_vm(arg); break; case KVM_CHECK_EXTENSION: r = kvm_dev_ioctl_check_extension_generic(arg); -- cgit v1.2.3 From 5b1c1493afe8d69909f9df3221bb2fffdf479f4a Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:23 +0100 Subject: KVM: s390: ucontrol: export SIE control block to user This patch exports the s390 SIE hardware control block to userspace via the mapping of the vcpu file descriptor. In order to do so, a new arch callback named kvm_arch_vcpu_fault is introduced for all architectures. It allows to map architecture specific pages. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 5 +++++ arch/ia64/kvm/kvm-ia64.c | 5 +++++ arch/powerpc/kvm/powerpc.c | 5 +++++ arch/s390/kvm/kvm-s390.c | 13 +++++++++++++ arch/x86/kvm/x86.c | 5 +++++ include/linux/kvm.h | 2 ++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 2 +- 8 files changed, 37 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6e53ff51422f..5ebf47d99e56 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -218,6 +218,11 @@ allocation of vcpu ids. For example, if userspace wants single-threaded guest vcpus, it should make all vcpu ids be a multiple of the number of vcpus per vcore. +For virtual cpus that have been created with S390 user controlled virtual +machines, the resulting vcpu fd can be memory mapped at page offset +KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual +cpu's hardware control block. 
+ 4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index df6b14194051..8ca7261e7b3d 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1566,6 +1566,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 83f244569874..a5671616af86 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -659,6 +659,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { u32 inst_lis = 0x3c000000; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index af05328aca25..d6bc65aeb950 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -763,6 +763,19 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ +#ifdef CONFIG_KVM_S390_UCONTROL + if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET) + && (kvm_is_ucontrol(vcpu->kvm))) { + vmf->page = virt_to_page(vcpu->arch.sie_block); + get_page(vmf->page); + return 0; + } +#endif + return VM_FAULT_SIGBUS; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 06925b4bcc27..a3ce196d21fe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2814,6 +2814,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) { int ret; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 
7f686f6708b0..8f888df206a2 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -440,6 +440,8 @@ struct kvm_ppc_pvinfo { /* machine type bits, to be used as argument to KVM_CREATE_VM */ #define KVM_VM_S390_UCONTROL 1 +#define KVM_S390_SIE_PAGE_OFFSET 1 + /* * ioctls for /dev/kvm fds: */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 82375e145e64..d4d4d7092110 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -450,6 +450,7 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_dev_ioctl_check_extension(long ext); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 32e3b048a6cf..64be836f3348 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1657,7 +1657,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif else - return VM_FAULT_SIGBUS; + return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); vmf->page = page; return 0; -- cgit v1.2.3 From 4a58ae614a28b1ae3bea1c74a307cdfb7c77dab8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 6 Jan 2012 15:06:18 +0100 Subject: KVM: MMU: unnecessary NX state assignment We can remove the first ->nx state assignment since it is assigned afterwards anyways. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 34da43086952..0a11468d853f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3321,7 +3321,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->get_cr3 = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; - context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { context->nx = false; -- cgit v1.2.3 From 2b036c6b861dc5da295c6fe19a3edcff7093fdeb Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 9 Jan 2012 14:00:35 -0500 Subject: KVM: SVM: Add support for AMD's OSVW feature in guests In some cases guests should not provide workarounds for errata even when the physical processor is affected. For example, because of erratum 400 on family 10h processors a Linux guest will read an MSR (resulting in VMEXIT) before going to idle in order to avoid getting stuck in a non-C0 state. This is not necessary: HLT and IO instructions are intercepted and therefore there is no reason for erratum 400 workaround in the guest. This patch allows us to present a guest with certain errata as fixed, regardless of the state of actual hardware. 
Signed-off-by: Boris Ostrovsky Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 6 +++++ arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++ arch/x86/kvm/svm.c | 59 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++ 5 files changed, 94 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52d6640a5ca1..bd69c93da8fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -478,6 +478,12 @@ struct kvm_vcpu_arch { u32 id; bool send_user_only; } apf; + + /* OSVW MSRs (AMD only) */ + struct { + u64 length; + u64 status; + } osvw; }; struct kvm_arch { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 89b02bfaaca5..9fed5bedaad6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5b97e1797a6d..26d1fb437eb5 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->ecx & bit(X86_FEATURE_OSVW)); +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fa553babe56..fce3ba0f2079 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ 
-110,6 +110,12 @@ struct nested_state { #define MSRPM_OFFSETS 16 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -556,6 +562,27 @@ static void svm_init_erratum_383(void) erratum_383_found = true; } +static void svm_init_osvw(struct kvm_vcpu *vcpu) +{ + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; + vcpu->arch.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU's osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor's + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don't know). + */ + if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) + vcpu->arch.osvw.status |= 1; +} + static int has_svm(void) { const char *msg; @@ -620,6 +647,36 @@ static int svm_hardware_enable(void *garbage) __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; } + + /* + * Get OSVW bits. + * + * Note that it is possible to have a system with mixed processor + * revisions and therefore different OSVW bits. If bits are not the same + * on different processors then choose the worst case (i.e. if erratum + * is present on one processor and not on another then assume that the + * erratum is present everywhere). 
+ */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { + uint64_t len, status = 0; + int err; + + len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + if (!err) + status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, + &err); + + if (err) + osvw_status = osvw_len = 0; + else { + if (len < osvw_len) + osvw_len = len; + osvw_status |= status; + osvw_status &= (1ULL << osvw_len) - 1; + } + } else + osvw_status = osvw_len = 0; + svm_init_erratum_383(); return 0; @@ -1186,6 +1243,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm_init_osvw(&svm->vcpu); + return &svm->vcpu; free_page4: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3ce196d21fe..2bd77a3a41ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1675,6 +1675,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) */ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.length = data; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.status = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1959,6 +1969,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) */ data = 0xbe702111; break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.length; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.status; + break; default: if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); -- cgit v1.2.3 From b9e5dc8d4511e6a00862a795319569e7fe7f60f4 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 11:20:30 +0100 Subject: KVM: provide synchronous registers in 
kvm_run On some cpus the overhead for virtualization instructions is in the same range as a system call. Having to call multiple ioctls to get set registers will make certain userspace handled exits more expensive than necessary. Lets provide a section in kvm_run that works as a shared save area for guest registers. We also provide two 64bit flags fields (architecture specific), that will specify 1. which parts of these fields are valid. 2. which registers were modified by userspace Each bit for these flag fields will define a group of registers (like general purpose) or a single register. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 23 +++++++++++++++++++++++ arch/ia64/include/asm/kvm.h | 4 ++++ arch/powerpc/include/asm/kvm.h | 4 ++++ arch/s390/include/asm/kvm.h | 3 +++ arch/x86/include/asm/kvm.h | 4 ++++ include/linux/kvm.h | 15 +++++++++++++++ 6 files changed, 53 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a67fb35993fa..7ca696227d3a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1771,6 +1771,29 @@ developer registration required to access it). /* Fix the size of the union. */ char padding[256]; }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[1024]; + } s; + +If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access +certain guest registers without having to call SET/GET_*REGS. Thus we can +avoid some system call overhead if userspace has to handle the exit. 
+Userspace can query the validity of the structure by checking +kvm_valid_regs for specific bits. These bits are architecture specific +and usually define the validity of a groups of registers. (e.g. one bit + for general purpose registers) + }; 6. Capabilities that can be enabled diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index bc90c75adf67..b9f82c84f093 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -261,4 +261,8 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #endif diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index f7727d91ac6b..7d9d4de057ef 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -265,6 +265,10 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #define KVM_REG_MASK 0x001f #define KVM_REG_EXT_MASK 0xffe0 #define KVM_REG_GPR 0x0000 diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 82b32a100c7d..325560afb77e 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -41,4 +41,7 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; #endif diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 4d8dcbdfc120..e7d1c194d272 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -321,4 +321,8 @@ struct kvm_xcrs { __u64 padding[16]; }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 6cf048d9604b..245bcb3a0fcd 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -279,6 +279,20 @@ struct kvm_run { /* Fix the size of the union. 
*/ char padding[256]; }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[1024]; + } s; }; /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ @@ -570,6 +584,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_S390_GMAP 71 #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 +#define KVM_CAP_SYNC_REGS 74 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 28867cee754c07b3fa0a679ed2ea394843130217 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Jan 2012 15:08:44 +0200 Subject: KVM: x86 emulator: add 8-bit memory operands Useful for MOVSX/MOVZX. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0982507b962a..5da6b3619201 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -57,6 +57,7 @@ #define OpDS 23ull /* DS */ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ +#define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -101,6 +102,7 @@ #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) #define SrcDX (OpDX << SrcShift) +#define SrcMem8 (OpMem8 << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ @@ -3656,6 +3658,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpMem8: + ctxt->memop.bytes 
= 1; + goto mem_common; case OpMem16: ctxt->memop.bytes = 2; goto mem_common; -- cgit v1.2.3 From 2adb5ad9fe1b44d0ae8b00d2bd6568e6163215b3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Jan 2012 15:08:45 +0200 Subject: KVM: x86 emulator: Remove byte-sized MOVSX/MOVZX hack Currently we treat MOVSX/MOVZX with a byte source as a byte instruction, and change the destination operand size with a hack. Change it to be a word instruction, so the destination receives its natural size, and change the source to be SrcMem8. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5da6b3619201..6eaedac7cf6a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -860,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, } static void decode_register_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op, - int inhibit_bytereg) + struct operand *op) { unsigned reg = ctxt->modrm_reg; int highbyte_regs = ctxt->rex_prefix == 0; @@ -878,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } op->type = OP_REG; - if ((ctxt->d & ByteOp) && !inhibit_bytereg) { + if (ctxt->d & ByteOp) { op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); op->bytes = 1; } else { @@ -3516,13 +3515,13 @@ static struct opcode twobyte_table[256] = { I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), - D(ByteOp 
| DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), @@ -3604,9 +3603,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, switch (d) { case OpReg: - decode_register_operand(ctxt, op, - op == &ctxt->dst && - ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); + decode_register_operand(ctxt, op); break; case OpImmUByte: rc = decode_imm(ctxt, op, 1, false); -- cgit v1.2.3 From 3ea8b75e47ac70bdd0a2c0492102682d43bfa3c4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 17 Jan 2012 19:50:08 +0900 Subject: KVM: MMU: Remove unused kvm_pte_chain Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd69c93da8fa..461016614324 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -#define NR_PTE_CHAIN_ENTRIES 5 - -struct kvm_pte_chain { - u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - struct hlist_node link; -}; - /* * kvm_mmu_page_role, below, is defined as: * -- cgit v1.2.3 From 9373e2c0576ee15b13e93bc5c5b3ef31d0612992 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 17 Jan 2012 19:51:20 +0900 Subject: KVM: MMU: Remove unused kvm parameter from __gfn_to_rmap() Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0a11468d853f..75b8f579b2a6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -946,7 +946,7 @@ 
static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) } } -static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, +static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { struct kvm_lpage_info *linfo; @@ -966,7 +966,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) struct kvm_memory_slot *slot; slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(kvm, gfn, level, slot); + return __gfn_to_rmap(gfn, level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1018,7 +1018,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, u64 *spte; int i, write_protected = 0; - rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); + rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); @@ -1033,7 +1033,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(kvm, gfn, i, slot); + rmapp = __gfn_to_rmap(gfn, i, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); -- cgit v1.2.3 From e4b35cc960bf216548516d8e39f5e364cfbbc86b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 17 Jan 2012 19:52:15 +0900 Subject: KVM: MMU: Remove unused kvm parameter from rmap_next() Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 26 +++++++++++++------------- arch/x86/kvm/mmu_audit.c | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 75b8f579b2a6..ae76cc3392e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -988,7 +988,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmapp); } -static 
u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +static u64 *rmap_next(unsigned long *rmapp, u64 *spte) { return pte_list_next(rmapp, spte); } @@ -1019,7 +1019,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, int i, write_protected = 0; rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); @@ -1027,14 +1027,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON(!is_large_pte(*spte)); @@ -1045,7 +1045,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, spte = NULL; write_protected = 1; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } @@ -1066,7 +1066,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int need_tlb_flush = 0; - while ((spte = rmap_next(kvm, rmapp, NULL))) { + while ((spte = rmap_next(rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); drop_spte(kvm, spte); @@ -1085,14 +1085,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!is_shadow_present_pte(*spte)); rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { drop_spte(kvm, spte); - spte = rmap_next(kvm, rmapp, NULL); 
+ spte = rmap_next(rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1102,7 +1102,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~shadow_accessed_mask; mmu_spte_clear_track_bits(spte); mmu_spte_set(spte, new_spte); - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } if (need_flush) @@ -1176,7 +1176,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) return kvm_unmap_rmapp(kvm, rmapp, data); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { int _young; u64 _spte = *spte; @@ -1186,7 +1186,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, young = 1; clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } return young; } @@ -1205,7 +1205,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { u64 _spte = *spte; BUG_ON(!(_spte & PT_PRESENT_MASK)); @@ -1214,7 +1214,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, young = 1; break; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } out: return young; diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6b..6eabae3d77ff 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slot = gfn_to_memslot(kvm, sp->gfn); rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { if (is_writable_pte(*spte)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - spte = rmap_next(kvm, rmapp, spte); + 
spte = rmap_next(rmapp, spte); } } -- cgit v1.2.3 From e2358851efbcdc34583ee11971a6e4d587ea8bf9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Jan 2012 14:09:50 +0100 Subject: KVM: SVM: comment nested paging and virtualization module parameters Also use true instead of 1 for enabling by default. Signed-off-by: Davidlohr Bueso Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fce3ba0f2079..7bbd17cc3488 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -182,11 +182,13 @@ static bool npt_enabled = true; #else static bool npt_enabled; #endif -static int npt = 1; +/* allow nested paging (virtualized MMU) for all guests */ +static int npt = true; module_param(npt, int, S_IRUGO); -static int nested = 1; +/* allow nested virtualization in KVM/SVM */ +static int nested = true; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From a52315e1d549dad80ff443151927226c11fd8c2b Mon Sep 17 00:00:00 2001 From: Julian Stecklina Date: Mon, 16 Jan 2012 14:02:20 +0100 Subject: KVM: Don't mistreat edge-triggered INIT IPI as INIT de-assert. (LAPIC) If the guest programs an IPI with level=0 (de-assert) and trig_mode=0 (edge), it is erroneously treated as INIT de-assert and ignored, but to quote the spec: "For this delivery mode [INIT de-assert], the level flag must be set to 0 and trigger mode flag to 1." 
Signed-off-by: Julian Stecklina Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cfdc6e0ef002..3ee1d83c695d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_INIT: - if (level) { + if (!trig_mode || level) { result = 1; vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.2.3 From 1a18a69b762374c423305772500f36eb8984ca52 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Feb 2012 12:23:21 +0200 Subject: KVM: x86 emulator: reject SYSENTER in compatibility mode on AMD guests If the guest thinks it's an AMD, it will not have prepared the SYSENTER MSRs, and if the guest executes SYSENTER in compatibility mode, it will fail. Detect this condition and inject #UD instead, like the spec says.
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6eaedac7cf6a..71450aca3b86 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1892,6 +1892,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->p = 1; } +static bool vendor_intel(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = ecx = 0; + return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) + && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx + && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx + && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; +} + static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2008,6 +2019,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); + /* + * Not recognized on AMD in compat mode (but is recognized in legacy + * mode). + */ + if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA) + && !vendor_intel(ctxt)) + return emulate_ud(ctxt); + /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ -- cgit v1.2.3 From 242ec97c358256ad6e62dab869f63a03cd244122 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 24 Jan 2012 15:06:05 +0200 Subject: KVM: x86: reset edge sense circuit of i8259 on init The spec says that during initialization "The edge sense circuit is reset which means that following initialization an interrupt request (IR) input must make a low-to-high transition to generate an interrupt", but currently if edge triggered interrupt is in IRR it is delivered after i8259 initialization. 
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b6a73537e1ef..81cf4fa4a2be 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) if (val & 0x10) { s->init4 = val & 1; s->last_irr = 0; + s->irr &= s->elcr; s->imr = 0; s->priority_add = 0; s->special_mask = 0; -- cgit v1.2.3 From df156f90a0f90649dd38b7667901ef85478f3d2b Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 7 Feb 2012 15:52:44 +0100 Subject: x86: Introduce x86_cpuinit.early_percpu_clock_init hook When a kvm guest uses kvmclock, it may hang on vcpu hot-plug. This is caused by an overflow in pvclock_get_nsec_offset, u64 delta = tsc - shadow->tsc_timestamp; which in turn is caused by undefined values from the percpu hv_clock that hasn't been initialized yet. The uninitialized clock on the cpu being booted is accessed from start_secondary -> smp_callin -> smp_store_cpu_info -> identify_secondary_cpu -> mtrr_ap_init -> mtrr_restore -> stop_machine_from_inactive_cpu -> queue_stop_cpus_work ... -> sched_clock -> kvm_clock_read which is well before the x86_cpuinit.setup_percpu_clockev call in start_secondary, where the percpu clock is initialized. This patch introduces a hook that allows the per_cpu clock to be set up/initialized early, avoiding overflow due to reading - undefined values - old values if the cpu was offlined and then onlined again Another possible early user of this clock source is ftrace, which accesses it to get timestamps for ring buffer entries. So if mtrr_ap_init is moved from identify_secondary_cpu to past x86_cpuinit.setup_percpu_clockev in start_secondary, ftrace may cause the same overflow/hang on cpu hot-plug anyway. More complete description of the problem: https://lkml.org/lkml/2012/2/2/101 Credits to Marcelo Tosatti for the hook idea.
Acked-by: Thomas Gleixner Signed-off-by: Igor Mammedov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/kvmclock.c | 4 +--- arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 517d4767ffdd..5d0afac2962c 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -145,9 +145,11 @@ struct x86_init_ops { /** * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device + * @early_percpu_clock_init: early init of the per cpu clock event device */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*early_percpu_clock_init)(void); void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d756b29..ca4e735adc54 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -144,8 +144,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) * we shouldn't fail. */ WARN_ON(kvm_register_clock("secondary cpu clock")); - /* ok, done with our trickery, call native */ - setup_secondary_APIC_clock(); } #endif @@ -194,7 +192,7 @@ void __init kvmclock_init(void) x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif machine_ops.shutdown = kvm_shutdown; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..a05d6fd5e06d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused) * most necessary things. 
*/ cpu_init(); + x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06ccc673..6f2ec53deed0 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = { }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, .fixup_cpu_id = x86_default_fixup_cpu_id, }; -- cgit v1.2.3 From a59cb29e4d81e025192550c2703f305637f016f6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2012 12:28:31 -0200 Subject: KVM: x86: increase recommended max vcpus to 160 Increase recommended max vcpus from 64 to 160 (tested internally at Red Hat). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 461016614324..782d973b0719 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -29,7 +29,7 @@ #include #define KVM_MAX_VCPUS 254 -#define KVM_SOFT_MAX_VCPUS 64 +#define KVM_SOFT_MAX_VCPUS 160 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 -- cgit v1.2.3 From bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:51 +0100 Subject: perf: Add generic taken branch sampling support This patch adds the ability to sample taken branches to the perf_event interface. The ability to capture taken branches is very useful for all sorts of analysis. For instance, basic block profiling, call counts, statistical call graph. This new capability requires hardware assist and as such may not be available on all HW platforms. 
On Intel x86 it is implemented on top of the Last Branch Record (LBR) facility. To enable taken branch sampling, the PERF_SAMPLE_BRANCH_STACK bit must be set in attr->sample_type. Sampled taken branches may be filtered by type and/or priv levels. The patch adds a new field, called branch_sample_type, to the perf_event_attr structure. It contains a bitmask of filters to apply to the sampled taken branches. Filters may be implemented in HW. If the HW filter does not exist or is not good enough, some architectures may also implement a SW filter. The following generic filters are currently defined: - PERF_SAMPLE_BRANCH_USER only branches whose targets are at the user level - PERF_SAMPLE_BRANCH_KERNEL only branches whose targets are at the kernel level - PERF_SAMPLE_BRANCH_HV only branches whose targets are at the hypervisor level - PERF_SAMPLE_BRANCH_ANY any type of branches (subject to priv levels filters) - PERF_SAMPLE_BRANCH_ANY_CALL any call branches (may incl. syscall on some arch) - PERF_SAMPLE_BRANCH_ANY_RETURN any return branches (may incl. syscall returns on some arch) - PERF_SAMPLE_BRANCH_IND_CALL indirect call branches Obviously, filters may be combined. The priv level bits are optional. If not provided, the priv level of the associated event is used. It is possible to collect branches at a priv level different from the associated event. Use of kernel, hv priv levels is subject to permissions and availability (hv). The number of taken branch records present in each sample may vary based on HW, the type of sampled branches, and the executed code. Therefore, each sample records the number of taken branches it contains.
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 +++++---- include/linux/perf_event.h | 71 ++++++++++++++++++++++++++++-- kernel/events/core.c | 68 ++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 47a7e63bfe54..309d0cc69163 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -142,9 +142,11 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].flags = 0; + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } @@ -165,19 +167,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, flags = 0; + u64 from, to, mis = 0, pred = 0; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); if (lbr_format == LBR_FORMAT_EIP_FLAGS) { - flags = !!(from & LBR_FROM_FLAG_MISPRED); + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; from = (u64)((((s64)from) << 1) >> 1); } - cpuc->lbr_entries[i].from = from; - cpuc->lbr_entries[i].to = to; - cpuc->lbr_entries[i].flags = flags; + cpuc->lbr_entries[i].from = from; + cpuc->lbr_entries[i].to = to; + cpuc->lbr_entries[i].mispred = mis; + cpuc->lbr_entries[i].predicted = pred; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } diff --git 
a/include/linux/perf_event.h b/include/linux/perf_event.h index 64426b71381f..5fc494f4a094 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,10 +129,39 @@ enum perf_event_sample_format { PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 12, /* non-ABI */ }; +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ + + PERF_SAMPLE_BRANCH_MAX = 1U << 7, /* non-ABI */ +}; + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + /* * The format of the data returned by read() on a perf event fd, * as specified by attr.read_format: @@ -240,6 +269,7 @@ struct perf_event_attr { __u64 bp_len; __u64 config2; /* extension of config1 */ }; + __u64 branch_sample_type; /* enum branch_sample_type */ }; /* @@ -458,6 +488,8 @@ enum perf_event_type { * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK * }; */ 
PERF_RECORD_SAMPLE = 9, @@ -530,12 +562,34 @@ struct perf_raw_record { void *data; }; +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. + */ struct perf_branch_entry { - __u64 from; - __u64 to; - __u64 flags; + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + reserved:62; }; +/* + * branch stack layout: + * nr: number of taken branches stored in entries[] + * + * Note that nr can vary from sample to sample + * branches (to, from) are stored from most recent + * to least recent, i.e., entries[0] contains the most + * recent branch. + */ struct perf_branch_stack { __u64 nr; struct perf_branch_entry entries[0]; @@ -566,7 +620,9 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; + struct hw_perf_event_extra extra_reg; + struct hw_perf_event_extra branch_reg; }; struct { /* software */ struct hrtimer hrtimer; @@ -1007,12 +1063,14 @@ struct perf_sample_data { u64 period; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; + struct perf_branch_stack *br_stack; }; static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) { data->addr = addr; data->raw = NULL; + data->br_stack = NULL; } extern void perf_output_sample(struct perf_output_handle *handle, @@ -1151,6 +1209,11 @@ extern void perf_bp_event(struct perf_event *event, void *data); # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void 
perf_output_end(struct perf_output_handle *handle); diff --git a/kernel/events/core.c b/kernel/events/core.c index e8b32ac75ce3..5820efdf47cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP) +/* + * branch priv levels that need permission checks + */ +#define PERF_SAMPLE_BRANCH_PERM_PLM \ + (PERF_SAMPLE_BRANCH_KERNEL |\ + PERF_SAMPLE_BRANCH_HV) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -3907,6 +3914,24 @@ void perf_output_sample(struct perf_output_handle *handle, } } } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { + size_t size; + + size = data->br_stack->nr + * sizeof(struct perf_branch_entry); + + perf_output_put(handle, data->br_stack->nr); + perf_output_copy(handle, data->br_stack->entries, size); + } else { + /* + * we always store at least the value of nr + */ + u64 nr = 0; + perf_output_put(handle, nr); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3949,6 +3974,15 @@ void perf_prepare_sample(struct perf_event_header *header, WARN_ON_ONCE(size & (sizeof(u64)-1)); header->size += size; } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + int size = sizeof(u64); /* nr */ + if (data->br_stack) { + size += data->br_stack->nr + * sizeof(struct perf_branch_entry); + } + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -5935,6 +5969,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { + u64 mask = attr->branch_sample_type; + + /* only using defined bits */ + if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) + return -EINVAL; + + /* at least one branch bit must be set */ + if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) + return -EINVAL; + + /* kernel level capture: check permissions 
*/ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* propagate priv level, when not set for branch */ + if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { + + /* exclude_kernel checked on syscall entry */ + if (!attr->exclude_kernel) + mask |= PERF_SAMPLE_BRANCH_KERNEL; + + if (!attr->exclude_user) + mask |= PERF_SAMPLE_BRANCH_USER; + + if (!attr->exclude_hv) + mask |= PERF_SAMPLE_BRANCH_HV; + /* + * adjust user setting (for HW filter setup) + */ + attr->branch_sample_type = mask; + } + } out: return ret; -- cgit v1.2.3 From 225ce53910edc3c2322b1e4f2ed049a9196cd0b3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:52 +0100 Subject: perf/x86: Add Intel LBR MSR definitions This patch adds the LBR definitions for NHM/WSM/SNB and Core. It also adds the definitions for the architected LBR MSR: LBR_SELECT, LBRT_TOS. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 18 +++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a6962d9161a0..ccb805966f68 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -56,6 +56,13 @@ #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c 
b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 309d0cc69163..6710a5116ebd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -203,23 +203,23 @@ void intel_pmu_lbr_read(void) void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; } void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x680; - x86_pmu.lbr_to = 0x6c0; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; } void intel_pmu_lbr_init_atom(void) { x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; } -- cgit v1.2.3 From b36817e8863090f1f24e538106ca50fa1d9e4003 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:53 +0100 Subject: perf/x86: Add Intel LBR sharing logic The Intel LBR on some recent processor is capable of filtering branches by type. The filter is configurable via the LBR_SELECT MSR register. There are limitation on how this register can be used. On Nehalem/Westmere, the LBR_SELECT is shared by the two HT threads when HT is on. It is private to each core when HT is off. On SandyBridge, the LBR_SELECT register is private to each thread when HT is on. It is private to each core when HT is off. The kernel must manage the sharing of LBR_SELECT. It allows multiple users on the same logical CPU to use LBR_SELECT as long as they program it with the same value. Across sibling CPUs (HT threads), the same restriction applies on NHM/WSM. This patch implements this sharing logic by leveraging the mechanism put in place for managing the offcore_response shared MSR. 
We modify __intel_shared_reg_get_constraints() to cause x86_get_event_constraint() to be called because LBR may be associated with events that may be counter constrained. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++ arch/x86/kernel/cpu/perf_event.h | 4 ++ arch/x86/kernel/cpu/perf_event_intel.c | 70 +++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f8bddb5b0600..377931354ac7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -426,6 +426,10 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; + /* mark not used */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + return x86_pmu.hw_config(event); } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 82db83b5c3bc..9b9c580a7ab8 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -33,6 +33,7 @@ enum extra_reg_type { EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -130,6 +131,7 @@ struct cpu_hw_events { void *lbr_context; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + struct er_account *lbr_sel; /* * Intel host/guest exclude bits @@ -342,6 +344,8 @@ struct x86_pmu { */ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ /* * Extra registers for events diff --git 
a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3bd37bdf1b8e..97f7bb587519 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1123,17 +1123,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx) */ static struct event_constraint * __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) + struct perf_event *event, + struct hw_perf_event_extra *reg) { struct event_constraint *c = &emptyconstraint; - struct hw_perf_event_extra *reg = &event->hw.extra_reg; struct er_account *era; unsigned long flags; int orig_idx = reg->idx; /* already allocated shared msr */ if (reg->alloc) - return &unconstrained; + return NULL; /* call x86_get_event_constraint() */ again: era = &cpuc->shared_regs->regs[reg->idx]; @@ -1156,14 +1156,10 @@ again: reg->alloc = 1; /* - * All events using extra_reg are unconstrained. - * Avoids calling x86_get_event_constraints() - * - * Must revisit if extra_reg controlling events - * ever have constraints. Worst case we go through - * the regular event constraint table. 
+ * need to call x86_get_event_constraint() + * to check if associated event has constraints */ - c = &unconstrained; + c = NULL; } else if (intel_try_alt_er(event, orig_idx)) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -1200,11 +1196,23 @@ static struct event_constraint * intel_shared_regs_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct event_constraint *c = NULL; - - if (event->hw.extra_reg.idx != EXTRA_REG_NONE) - c = __intel_shared_reg_get_constraints(cpuc, event); - + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } return c; } @@ -1252,6 +1260,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, reg = &event->hw.extra_reg; if (reg->idx != EXTRA_REG_NONE) __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); } static void intel_put_event_constraints(struct cpu_hw_events *cpuc, @@ -1431,7 +1443,7 @@ static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!x86_pmu.extra_regs) + if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) return NOTIFY_OK; cpuc->shared_regs = allocate_shared_regs(cpu); @@ -1453,22 +1465,28 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) return; - for_each_cpu(i, topology_thread_cpumask(cpu)) { - struct intel_shared_regs *pc; + if 
(!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_shared_regs *pc; - pc = per_cpu(cpu_hw_events, i).shared_regs; - if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online = cpuc->shared_regs; - cpuc->shared_regs = pc; - break; + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; } - cpuc->shared_regs->core_id = core_id; - cpuc->shared_regs->refcnt++; + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; } static void intel_pmu_cpu_dying(int cpu) -- cgit v1.2.3 From ff3fb511ba377e8a0a7f553cc352237f70d08121 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:54 +0100 Subject: perf/x86: Sync branch stack sampling with precise_sampling If precise sampling is enabled on Intel x86 then perf_event uses PEBS. To correct for the off-by-one error of PEBS, perf_event uses LBR when precise_sample > 1. On Intel x86 PERF_SAMPLE_BRANCH_STACK is implemented using LBR, therefore both features must be coordinated as they may not configure LBR the same way. For PEBS, LBR needs to capture all branches at the priv level of the associated event. This patch checks that the branch type and priv level of BRANCH_STACK is compatible with that of the PEBS LBR requirement, thereby allowing: $ perf record -b any,u -e instructions:upp .... But: $ perf record -b any_call,u -e instructions:upp Is not possible. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 377931354ac7..cea567483274 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -353,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event) return 0; } +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { @@ -369,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * 
event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } } /* -- cgit v1.2.3 From c5cc2cd906ea9fe73e3c93f9ad824996faa278cc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:55 +0100 Subject: perf/x86: Add Intel LBR mappings for PERF_SAMPLE_BRANCH filters This patch adds the mappings from the generic PERF_SAMPLE_BRANCH_* filters to the actual Intel x86LBR filters, whenever they exist. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-6-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 103 ++++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 9b9c580a7ab8..4e948976aefb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -539,6 +539,8 @@ void intel_pmu_lbr_init_nhm(void); void intel_pmu_lbr_init_atom(void); +void intel_pmu_lbr_init_snb(void); + int p4_pmu_init(void); int p6_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 97f7bb587519..b0db01692441 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1757,7 +1757,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - intel_pmu_lbr_init_nhm(); + intel_pmu_lbr_init_snb(); x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 
6710a5116ebd..e54a063b2863 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -13,6 +13,49 @@ enum { LBR_FORMAT_EIP_FLAGS = 0x03, }; +/* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ +#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT 2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT 5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ +#define LBR_FAR_BIT 8 /* do not capture far branches */ + +#define LBR_KERNEL (1 << LBR_KERNEL_BIT) +#define LBR_USER (1 << LBR_USER_BIT) +#define LBR_JCC (1 << LBR_JCC_BIT) +#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) +#define LBR_RETURN (1 << LBR_RETURN_BIT) +#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) +#define LBR_FAR (1 << LBR_FAR_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP -1 /* LBR filter not supported */ +#define LBR_IGN 0 /* ignored */ + +#define LBR_ANY \ + (LBR_JCC |\ + LBR_REL_CALL |\ + LBR_IND_CALL |\ + LBR_RETURN |\ + LBR_REL_JMP |\ + LBR_IND_JMP |\ + LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. 
@@ -151,8 +194,6 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_stack.nr = i; } -#define LBR_FROM_FLAG_MISPRED (1ULL << 63) - /* * Due to lack of segmentation in Linux the effective address (offset) * is the same as the linear address, allowing us to merge the LIP and EIP @@ -200,26 +241,84 @@ void intel_pmu_lbr_read(void) intel_pmu_lbr_read_64(cpuc); } +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches + */ + [PERF_SAMPLE_BRANCH_ANY_CALL] = + LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include IND_JMP to capture IND_CALL + */ + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, +}; + +/* core */ void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + pr_cont("4-deep LBR, "); } +/* nehalem/westmere */ void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + pr_cont("16-deep LBR, "); } +/* sandy bridge 
*/ +void intel_pmu_lbr_init_snb(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + pr_cont("16-deep LBR, "); +} + +/* atom */ void intel_pmu_lbr_init_atom(void) { x86_pmu.lbr_nr = 8; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + pr_cont("8-deep LBR, "); } -- cgit v1.2.3 From 88c9a65e13f393fd60d8b9e9c659a34f9e39967d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:56 +0100 Subject: perf/x86: Disable LBR support for older Intel Atom processors The patch adds a restriction for Intel Atom LBR support. Only steppings 10 (PineView) and more recent are supported. Older models do not have a functional LBR. Their LBR does not freeze on PMU interrupt which makes LBR unusable in the context of perf_events. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-7-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index e54a063b2863..07f0ff88e443 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -315,6 +315,16 @@ void intel_pmu_lbr_init_snb(void) /* atom */ void intel_pmu_lbr_init_atom(void) { + /* + * only models starting at stepping 10 seems + * to have an operational LBR which can freeze + * on PMU interrupt + */ + if (boot_cpu_data.x86_mask < 10) { + pr_cont("LBR disabled due to erratum"); + return; + } + x86_pmu.lbr_nr = 8; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_CORE_FROM; -- cgit v1.2.3 From 60ce0fbd072695866cb27b729690ab59dce705a5 Mon Sep 17 00:00:00 2001 
From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:57 +0100 Subject: perf/x86: Implement PERF_SAMPLE_BRANCH for Intel CPUs This patch implements PERF_SAMPLE_BRANCH support for Intel x86processors. It connects PERF_SAMPLE_BRANCH to the actual LBR. The patch adds the hooks in the PMU irq handler to save the LBR on counter overflow for both regular and PEBS modes. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-8-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_intel.c | 35 ++++++++++++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 10 ++-- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 86 +++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4e948976aefb..ef7419cbd13d 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -541,6 +541,8 @@ void intel_pmu_lbr_init_atom(void); void intel_pmu_lbr_init_snb(void); +int intel_pmu_setup_lbr_filter(struct perf_event *event); + int p4_pmu_init(void); int p6_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index b0db01692441..7cc1e2dcc4dd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -727,6 +727,19 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) +{ + /* user explicitly requested branch sampling */ + if (has_branch_stack(event)) + return true; + + /* implicit branch sampling to correct PEBS skid */ + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) + return true; + + return false; +} + static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -881,6 +894,13 @@ 
static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + /* + * must disable before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_disable(event); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -935,6 +955,12 @@ static void intel_pmu_enable_event(struct perf_event *event) intel_pmu_enable_bts(hwc->config); return; } + /* + * must enabled before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_enable(event); if (event->attr.exclude_host) cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); @@ -1057,6 +1083,9 @@ again: data.period = event->hw.last_period; + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -1305,6 +1334,12 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.config = alt_config; } + if (intel_pmu_needs_lbr_smpl(event)) { + ret = intel_pmu_setup_lbr_filter(event); + if (ret) + return ret; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index d6bd49faa40c..ee7e3c8d9d6a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -439,9 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event) hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; cpuc->pebs_enabled |= 1ULL << hwc->idx; - - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) - intel_pmu_lbr_enable(event); } void intel_pmu_pebs_disable(struct perf_event *event) @@ -454,9 +451,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); hwc->config |= 
ARCH_PERFMON_EVENTSEL_INT; - - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) - intel_pmu_lbr_disable(event); } void intel_pmu_pebs_enable_all(void) @@ -572,6 +566,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * both formats and we don't use the other fields in this * routine. */ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct pebs_record_core *pebs = __pebs; struct perf_sample_data data; struct pt_regs regs; @@ -602,6 +597,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event, else regs.flags &= ~PERF_EFLAGS_EXACT; + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, &regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 07f0ff88e443..d0fb864ff2b0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -56,6 +56,10 @@ enum { #define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define for_each_branch_sample_type(x) \ + for ((x) = PERF_SAMPLE_BRANCH_USER; \ + (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -64,6 +68,10 @@ enum { static void __intel_pmu_lbr_enable(void) { u64 debugctl; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_sel) + wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); @@ -119,7 +127,6 @@ void intel_pmu_lbr_enable(struct perf_event *event) * Reset the LBR stack if we changed task context to * avoid data leaks.
*/ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { intel_pmu_lbr_reset(); cpuc->lbr_context = event->ctx; @@ -138,8 +145,11 @@ void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - if (cpuc->enabled && !cpuc->lbr_users) + if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); + /* avoid stale pointer */ + cpuc->lbr_context = NULL; + } } void intel_pmu_lbr_enable_all(void) @@ -158,6 +168,9 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } +/* + * TOS = most recently recorded branch + */ static inline u64 intel_pmu_lbr_tos(void) { u64 tos; @@ -241,6 +254,75 @@ void intel_pmu_lbr_read(void) intel_pmu_lbr_read_64(cpuc); } +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, m; + u64 v; + + for_each_branch_sample_type(m) { + if (!(br_type & m)) + continue; + + v = x86_pmu.lbr_sel_map[m]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + mask |= v; + + if (m == PERF_SAMPLE_BRANCH_ANY) + break; + } + reg = &event->hw.branch_reg; + reg->idx = EXTRA_REG_LBR; + + /* LBR_SELECT operates in suppress mode so invert mask */ + reg->config = ~mask & x86_pmu.lbr_sel_mask; + + return 0; +} + +/* + * all the bits supported on some flavor of x86LBR + * we ignore BRANCH_HV because it is not supported + */ +#define PERF_SAMPLE_BRANCH_X86_ALL \ + (PERF_SAMPLE_BRANCH_ANY |\ + PERF_SAMPLE_BRANCH_USER |\ + PERF_SAMPLE_BRANCH_KERNEL) + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + + /* + * no LBR on this PMU + */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* + * if no LBR HW filter, users can only + * capture all branches + */ + if (!x86_pmu.lbr_sel_map) { + 
if (br_type != PERF_SAMPLE_BRANCH_X86_ALL) + return -EOPNOTSUPP; + return 0; + } + /* + * we ignore branch priv levels we do not + * know about: BRANCH_HV + */ + + return intel_pmu_setup_hw_lbr_filter(event); +} + /* * Map interface branch filters onto LBR filters */ -- cgit v1.2.3 From 3e702ff6d1ea12dcf1c798ecb61e7f3a1579df42 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:58 +0100 Subject: perf/x86: Add LBR software filter support for Intel CPUs This patch adds an internal software filter to complement the (optional) LBR hardware filter. The software filter is necessary: - as a substitute when there is no HW LBR filter (e.g., Atom, Core) - to complement HW LBR filter in case of errata (e.g., Nehalem/Westmere) - to provide finer grain filtering (e.g., all processors) Sometimes the LBR HW filter cannot distinguish between two types of branches. For instance, to capture syscall as CALLS, it is necessary to enable the LBR_FAR filter which will also capture JMP instructions. Thus, a second pass is necessary to filter those out, this is what the SW filter can do. The SW filter is built on top of the internal x86 disassembler. It is a best effort filter especially for user level code. It is subject to the availability of the text page of the program. The SW filter is enabled on all Intel processors. It is bypassed when the user is capturing all branches at all priv levels.
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-9-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 10 + arch/x86/kernel/cpu/perf_event_intel_ds.c | 12 +- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 332 +++++++++++++++++++++++++++-- 3 files changed, 321 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ef7419cbd13d..f104c054dc5c 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -132,6 +132,7 @@ struct cpu_hw_events { struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; + u64 br_sel; /* * Intel host/guest exclude bits @@ -459,6 +460,15 @@ extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ee7e3c8d9d6a..7f64df19e7dd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -3,6 +3,7 @@ #include #include +#include #include "perf_event.h" @@ -469,17 +470,6 @@ void intel_pmu_pebs_disable_all(void) wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } -#include - -static inline bool kernel_ip(unsigned long ip) -{ -#ifdef CONFIG_X86_32 - return ip > PAGE_OFFSET; -#else - return (long)ip < 0; -#endif -} - static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d0fb864ff2b0..520b4265fcd2 100644 --- 
a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -3,6 +3,7 @@ #include #include +#include #include "perf_event.h" @@ -60,6 +61,53 @@ enum { for ((x) = PERF_SAMPLE_BRANCH_USER; \ (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) +/* + * x86 control flow change classification + * x86 control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_IND_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack.
@@ -131,6 +179,7 @@ void intel_pmu_lbr_enable(struct perf_event *event) intel_pmu_lbr_reset(); cpuc->lbr_context = event->ctx; } + cpuc->br_sel = event->hw.branch_reg.reg; cpuc->lbr_users++; } @@ -252,6 +301,44 @@ void intel_pmu_lbr_read(void) intel_pmu_lbr_read_32(cpuc); else intel_pmu_lbr_read_64(cpuc); + + intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + int mask = 0; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* we ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + /* + * stash actual user request into reg, it may + * be used by fixup code for some CPU + */ + event->hw.branch_reg.reg = mask; } /* @@ -273,10 +360,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) v = x86_pmu.lbr_sel_map[m]; if (v == LBR_NOT_SUPP) return -EOPNOTSUPP; - mask |= v; - if (m == PERF_SAMPLE_BRANCH_ANY) - break; + if (v != LBR_IGN) + mask |= v; } reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; @@ -287,18 +373,9 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) return 0; } -/* - * all the bits supported on some flavor of x86LBR - * we ignore BRANCH_HV because it is not supported - */ -#define PERF_SAMPLE_BRANCH_X86_ALL \ - (PERF_SAMPLE_BRANCH_ANY |\ - PERF_SAMPLE_BRANCH_USER |\ - PERF_SAMPLE_BRANCH_KERNEL) - int intel_pmu_setup_lbr_filter(struct perf_event *event) { - u64 br_type = event->attr.branch_sample_type; + int ret = 0; /* * no 
LBR on this PMU @@ -307,20 +384,210 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) return -EOPNOTSUPP; /* - * if no LBR HW filter, users can only - * capture all branches + * setup SW LBR filter */ - if (!x86_pmu.lbr_sel_map) { - if (br_type != PERF_SAMPLE_BRANCH_X86_ALL) - return -EOPNOTSUPP; - return 0; + intel_pmu_setup_sw_lbr_filter(event); + + /* + * setup HW LBR filter, if any + */ + if (x86_pmu.lbr_sel_map) + ret = intel_pmu_setup_hw_lbr_filter(event); + + return ret; +} + +/* + * return the type of control flow change at address "from" + * intruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +static int branch_type(unsigned long from, unsigned long to) +{ + struct insn insn; + void *addr; + int bytes, size = MAX_INSN_SIZE; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. 
+ */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes = copy_from_user_nmi(buf, (void __user *)from, size); + if (bytes != size) + return X86_BR_NONE; + + addr = buf; + } else + addr = (void *)from; + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, addr, is64); + insn_get_opcode(&insn); + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + insn_get_modrm(&insn); + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; } /* - * we ignore branch priv levels we do not - * know about: BRANCH_HV + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. 
Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, sysenter, int), then it means + * it was an irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented */ + if (ret != X86_BR_NONE) + ret |= to_plm; - return intel_pmu_setup_hw_lbr_filter(event); + return ret; +} + +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ + u64 from, to; + int br_sel = cpuc->br_sel; + int i, j, type; + bool compress = false; + + /* if sampling all branches, then nothing to filter */ + if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + + type = branch_type(from, to); + + /* if type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; + compress = true; + } + } + + if (!compress) + return; + + /* remove all entries with from=0 */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } } /* @@ -363,6 +630,10 @@ void intel_pmu_lbr_init_core(void) x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ pr_cont("4-deep LBR, "); } @@ -377,6 +648,13 @@ void intel_pmu_lbr_init_nhm(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + /* + * SW branch filter usage: + * - workaround LBR_SEL errata (see above) + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ pr_cont("16-deep LBR, "); } @@ -391,6 +669,12 @@ void intel_pmu_lbr_init_snb(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = snb_lbr_sel_map; + /* + * SW branch filter usage: + * - support syscall, sysret capture. 
+ * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ pr_cont("16-deep LBR, "); } @@ -412,5 +696,9 @@ void intel_pmu_lbr_init_atom(void) x86_pmu.lbr_from = MSR_LBR_CORE_FROM; x86_pmu.lbr_to = MSR_LBR_CORE_TO; + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ pr_cont("8-deep LBR, "); } -- cgit v1.2.3 From 2481c5fa6db0237e4f0168f88913178b2b495b7c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:59 +0100 Subject: perf: Disable PERF_SAMPLE_BRANCH_* when not supported PERF_SAMPLE_BRANCH_* is disabled for: - SW events (sw counters, tracepoints) - HW breakpoints - ALL but Intel x86 architecture - AMD64 processors Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-10-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 4 ++++ arch/arm/kernel/perf_event.c | 4 ++++ arch/mips/kernel/perf_event_mipsxx.c | 4 ++++ arch/powerpc/kernel/perf_event.c | 4 ++++ arch/sh/kernel/perf_event.c | 4 ++++ arch/sparc/kernel/perf_event.c | 4 ++++ arch/x86/kernel/cpu/perf_event_amd.c | 3 +++ kernel/events/core.c | 24 ++++++++++++++++++++++++ kernel/events/hw_breakpoint.c | 6 ++++++ 9 files changed, 57 insertions(+) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 8143cd7cdbfb..0dae252f7a33 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -685,6 +685,10 @@ static int alpha_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 5bb91bf3d47f..a23c42abc694 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -539,6 +539,10 @@ static int 
armpmu_event_init(struct perf_event *event) int err = 0; atomic_t *active_events = &armpmu->active_events; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (armpmu->map_event(event) == -ENOENT) return -ENOENT; diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index e3b897acfbc0..811084f4e422 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -606,6 +606,10 @@ static int mipspmu_event_init(struct perf_event *event) { int err = 0; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index f04c2301725e..c2e27ede07ec 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1084,6 +1084,10 @@ static int power_pmu_event_init(struct perf_event *event) if (!ppmu) return -ENOENT; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_HARDWARE: ev = event->attr.config; diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 10b14e3a7eb8..068b8a2759b5 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -310,6 +310,10 @@ static int sh_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HW_CACHE: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 614da624330c..8e16a4a21582 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1105,6 +1105,10 @@ static int sparc_pmu_event_init(struct perf_event *event) if (atomic_read(&nmi_active) < 0) return -ENODEV; + 
/* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (attr->type) { case PERF_TYPE_HARDWARE: if (attr->config >= sparc_pmu->max_events) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 67250a52430b..dd002faff7a6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -139,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 diff --git a/kernel/events/core.c b/kernel/events/core.c index 5820efdf47cd..242bb51c67f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5044,6 +5044,12 @@ static int perf_swevent_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: case PERF_COUNT_SW_TASK_CLOCK: @@ -5154,6 +5160,12 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + err = perf_trace_init(event); if (err) return err; @@ -5379,6 +5391,12 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5453,6 +5471,12 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + 
return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3330022a7ac1..bb38c4d3ee12 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp) if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; + /* + * no branch sampling for breakpoint events + */ + if (has_branch_stack(bp)) + return -EOPNOTSUPP; + err = register_perf_hw_breakpoint(bp); if (err) return err; -- cgit v1.2.3 From d010b3326cf06b3406cdd88af16dcf4e4b6fec2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:21:00 +0100 Subject: perf: Add callback to flush branch_stack on context switch With branch stack sampling, it is possible to filter by priv levels. In system-wide mode, that means it is possible to capture only user level branches. The builtin SW LBR filter needs to disassemble code based on LBR captured addresses. For that, it needs to know the task the addresses are associated with. Because of context switches, the content of the branch stack buffer may contain addresses from different tasks. We need a callback on context switch to either flush the branch stack or save it. This patch adds a new callback in struct pmu which is called during context switches. The callback is called only when necessary. That is when a system-wide context has, at least, one event which uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for per-thread context. In this version, the Intel x86 code simply flushes (resets) the LBR on context switches (fills it with zeroes). Those zeroed branches are then filtered out by the SW filter. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 21 ++++++--- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++ include/linux/perf_event.h | 9 +++- kernel/events/core.c | 85 ++++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cea567483274..0a18d16cb58d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_flush_branch_stack(void) +{ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); +} + static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, }; void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f104c054dc5c..74387c12dc72 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -324,6 +324,7 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + void (*flush_branch_stack)(void); /* * Intel 
Arch Perfmon v2+ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7cc1e2dcc4dd..6627089232a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, }; static __init void intel_clovertown_quirk(void) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5fc494f4a094..fbbf5e598368 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,6 +746,11 @@ struct pmu { * if no implementation is provided it will default to: event->hw.idx + 1. 
*/ int (*event_idx) (struct perf_event *event); /*optional */ + + /* + * flush branch stack on context-switches (needed in cpu-wide mode) + */ + void (*flush_branch_stack) (void); }; /** @@ -979,7 +984,8 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; - int nr_cgroups; /* cgroup events present */ + int nr_cgroups; /* cgroup evts */ + int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; }; @@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); + struct perf_sample_data { u64 type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 242bb51c67f2..c61234b1a988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -137,6 +137,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; + if (has_branch_stack(event)) + ctx->nr_branch_stack++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } + if (has_branch_stack(event)) + ctx->nr_branch_stack--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2201,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_rotate_start(ctx->pmu); } +/* + * When sampling the branck stack in system-wide, it may be necessary + * to flush the stack on context switch. 
This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branch from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling. + */ +static void perf_branch_stack_sched_in(struct task_struct *prev, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* no need to flush branch stack if not changing task */ + if (prev == task) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + /* + * check if the context has at least one + * event using PERF_SAMPLE_BRANCH_STACK + */ + if (cpuctx->ctx.nr_branch_stack > 0 + && pmu->flush_branch_stack) { + + pmu = cpuctx->ctx.pmu; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->flush_branch_stack(); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + /* * Called from scheduler to add the events of the current task * with interrupts disabled. 
@@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + + /* check for system-wide branch_stack events */ + if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + perf_branch_stack_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event) atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); static_key_slow_dec_deferred(&perf_sched_events); } + + if (has_branch_stack(event)) { + static_key_slow_dec_deferred(&perf_sched_events); + /* is system-wide event */ + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } if (event->rb) { @@ -5924,6 +6003,12 @@ done: return ERR_PTR(err); } } + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } return event; -- cgit v1.2.3 From 6414fa6a150111750011f477899d370244da4171 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Mar 2012 06:38:42 +0000 Subject: aout: move setup_arg_pages() prior to reading/mapping the binary Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 14 +++++++------- fs/binfmt_aout.c | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index fd843877e841..39e49091f648 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -315,6 +315,13 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is 
this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; @@ -410,13 +417,6 @@ beyond_if: set_brk(current->mm->start_brk, current->mm->brk); - retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - current->mm->start_stack = (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); /* start thread */ diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a6395bdb26ae..1ff94054d35a 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -259,6 +259,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; @@ -352,13 +359,6 @@ beyond_if: return retval; } - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - current->mm->start_stack = (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); #ifdef __alpha__ -- cgit v1.2.3 From a628b684d27d22631d1819890f13047ae9075241 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 5 Mar 2012 13:39:29 -0800 Subject: x32: Provide separate is_ia32_task() and is_x32_task() predicates The is_compat_task() test is composed of two predicates already, so make each of them available separately. Signed-off-by: H. Peter Anvin Cc: H. J. 
Lu Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/include/asm/compat.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index e7f68b49c01a..355edc091604 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -235,12 +235,17 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)round_down(sp - len, 16); } -static inline bool is_compat_task(void) +static inline bool is_ia32_task(void) { #ifdef CONFIG_IA32_EMULATION if (current_thread_info()->status & TS_COMPAT) return true; #endif + return false; +} + +static inline bool is_x32_task(void) +{ #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) return true; @@ -248,4 +253,9 @@ static inline bool is_compat_task(void) return false; } +static inline bool is_compat_task(void) +{ + return is_ia32_task() || is_x32_task(); +} + #endif /* _ASM_X86_COMPAT_H */ -- cgit v1.2.3 From e7084fd52ed71249ab2ce7a7d89d601c9d1f904c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 5 Mar 2012 13:40:24 -0800 Subject: x32: Switch to a 64-bit clock_t clock_t is used mainly to give the number of jiffies a certain process has burned. It is entirely feasible for a long-running process to consume more than 2^32 jiffies especially in a multiprocess system. As such, switch to a 64-bit clock_t for x32, just as we already switched to a 64-bit time_t. clock_t is only used in a handful of places, and as such it is really not a very significant change. The one that has the biggest impact is in struct siginfo, but since the *size* of struct siginfo doesn't change (it is padded to the hilt) it is fairly easy to make this a localized change. 
This also gets rid of sys_x32_times, however since this is a pretty late change don't compactify the system call numbers; we can reuse system call slot 521 next time we need an x32 system call. Reported-by: Gregory M. Lueck Signed-off-by: H. Peter Anvin Cc: H. J. Lu Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/ia32/ia32_signal.c | 10 ++++++++-- arch/x86/include/asm/ia32.h | 9 +++++++++ arch/x86/syscalls/syscall_64.tbl | 4 ++-- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 25d80f3faf2e..bc09ed2a8b97 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -37,6 +37,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err = 0; + bool ia32 = is_ia32_task(); if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; @@ -66,8 +67,13 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) case __SI_FAULT >> 16: break; case __SI_CHLD >> 16: - put_user_ex(from->si_utime, &to->si_utime); - put_user_ex(from->si_stime, &to->si_stime); + if (ia32) { + put_user_ex(from->si_utime, &to->si_utime); + put_user_ex(from->si_stime, &to->si_stime); + } else { + put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); + put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); + } put_user_ex(from->si_status, &to->si_status); /* FALL THROUGH */ default: diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index c6435ab1cc13..7d0c18587709 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -125,6 +125,15 @@ typedef struct compat_siginfo { compat_clock_t _stime; } _sigchld; + /* SIGCHLD (x32 version) */ + struct { + unsigned int _pid; /* which child */ + unsigned int _uid; /* sender's uid */ + int _status; /* exit code */ + s64 _utime; + s64 _stime; + } _sigchld_x32; + /* SIGILL, SIGFPE, 
SIGSEGV, SIGBUS */ struct { unsigned int _addr; /* faulting insn/memory ref. */ diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 4aecc7e31166..0d778b800884 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -106,7 +106,7 @@ 97 common getrlimit sys_getrlimit 98 common getrusage sys_getrusage 99 common sysinfo sys_sysinfo -100 64 times sys_times +100 common times sys_times 101 common ptrace sys_ptrace 102 common getuid sys_getuid 103 common syslog sys_syslog @@ -331,7 +331,7 @@ 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg 520 x32 execve stub_x32_execve -521 x32 times compat_sys_times +# 521 available 522 x32 rt_sigpending sys32_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait 524 x32 rt_sigqueueinfo sys32_rt_sigqueueinfo -- cgit v1.2.3 From 55283e2537714f9370c4ab847d170acf223daf90 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 5 Mar 2012 15:32:11 -0800 Subject: x32: Add ptrace for x32 X32 ptrace is a hybrid of 64bit ptrace and compat ptrace with 32bit address and longs. It use 64bit ptrace to access the full 64bit registers. PTRACE_PEEKUSR and PTRACE_POKEUSR are only allowed to access segment and debug registers. PTRACE_PEEKUSR returns the lower 32bits and PTRACE_POKEUSR zero-extends 32bit value to 64bit. It works since the upper 32bits of segment and debug registers of x32 process are always zero. GDB only uses PTRACE_PEEKUSR and PTRACE_POKEUSR to access segment and debug registers. [ hpa: changed TIF_X32 test to use !is_ia32_task() instead, and moved the system call number to the now-unused 521 slot. ] Signed-off-by: "H.J. Lu" Signed-off-by: H. 
Peter Anvin Cc: Roland McGrath Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/kernel/ptrace.c | 99 ++++++++++++++++++++++++++++++++++++++++ arch/x86/syscalls/syscall_64.tbl | 4 +- 2 files changed, 101 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 50267386b766..93e7877a19c4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1130,6 +1130,100 @@ static int genregs32_set(struct task_struct *target, return ret; } +#ifdef CONFIG_X86_X32_ABI +static long x32_arch_ptrace(struct task_struct *child, + compat_long_t request, compat_ulong_t caddr, + compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + + switch (request) { + /* Read 32bits at location addr in the USER area. Only allow + to return the lower 32bits of segment and debug registers. */ + case PTRACE_PEEKUSR: { + u32 tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, (__u32 __user *)datap); + break; + } + + /* Write the word at location addr in the USER area. Only allow + to update segment and debug registers with the upper 32bits + zero-extended. 
*/ + case PTRACE_POKEUSR: + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. 
*/ + case PTRACE_ARCH_PRCTL: + return do_arch_prctl(child, data, addr); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif + long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { @@ -1139,6 +1233,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, int ret; __u32 val; +#ifdef CONFIG_X86_X32_ABI + if (!is_ia32_task()) + return x32_arch_ptrace(child, request, caddr, cdata); +#endif + switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 0d778b800884..dd29a9ea27c5 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -107,7 +107,7 @@ 98 common getrusage sys_getrusage 99 common sysinfo sys_sysinfo 100 common times sys_times -101 common ptrace sys_ptrace +101 64 ptrace sys_ptrace 102 common getuid sys_getuid 103 common syslog sys_syslog 104 common getgid sys_getgid @@ -331,7 +331,7 @@ 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg 520 x32 execve stub_x32_execve -# 521 available +521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending sys32_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait 524 x32 rt_sigqueueinfo sys32_rt_sigqueueinfo -- cgit v1.2.3 From 373913b568cbfbefcee3263b98bd5a1a8b491f1b Mon Sep 17 00:00:00 2001 From: Philip Prindeville Date: Mon, 5 Mar 2012 15:05:14 -0800 Subject: x86/geode/alix2: Supplement driver to include GPIO button support GPIO 24 is used in reference designs as a soft-reset button, and the alix2 is no exception. Add it as a gpio-button. Use symbolic values to describe BIOS addresses. Record the model number. Signed-off-by: Philip A. 
Prindeville Acked-by: Ed Wildgoose Acked-by: Andres Salomon Cc: Matthew Garrett Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/n/tip-sjp6k1rjksitx1pej0c0qxd1@git.kernel.org [ tidied up the code a bit ] Signed-off-by: Ingo Molnar --- arch/x86/platform/geode/alix.c | 76 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index dc5f1d32aced..90e23e7679a5 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -6,6 +6,7 @@ * * Copyright (C) 2008 Constantin Baranov * Copyright (C) 2011 Ed Wildgoose + * and Philip Prindeville * * TODO: There are large similarities with leds-net5501.c * by Alessandro Zummo @@ -24,14 +25,47 @@ #include #include #include +#include +#include +#include #include +#define BIOS_SIGNATURE_TINYBIOS 0xf0000 +#define BIOS_SIGNATURE_COREBOOT 0x500 +#define BIOS_REGION_SIZE 0x10000 + static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); +static struct gpio_keys_button alix_gpio_buttons[] = { + { + .code = KEY_RESTART, + .gpio = 24, + .active_low = 1, + .desc = "Reset button", + .type = EV_KEY, + .wakeup = 0, + .debounce_interval = 100, + .can_disable = 0, + } +}; +static struct gpio_keys_platform_data alix_buttons_data = { + .buttons = alix_gpio_buttons, + .nbuttons = ARRAY_SIZE(alix_gpio_buttons), + .poll_interval = 20, +}; + +static struct platform_device alix_buttons_dev = { + .name = "gpio-keys-polled", + .id = 1, + .dev = { + .platform_data = &alix_buttons_data, + } +}; + static struct gpio_led alix_leds[] = { { .name = "alix:1", @@ -64,17 +98,22 @@ static struct platform_device alix_leds_dev = { .dev.platform_data = &alix_leds_data, }; +static struct __initdata platform_device *alix_devs[] = { + &alix_buttons_dev, + 
&alix_leds_dev, +}; + static void __init register_alix(void) { /* Setup LED control through leds-gpio driver */ - platform_device_register(&alix_leds_dev); + platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs)); } -static int __init alix_present(unsigned long bios_phys, +static bool __init alix_present(unsigned long bios_phys, const char *alix_sig, size_t alix_sig_len) { - const size_t bios_len = 0x00010000; + const size_t bios_len = BIOS_REGION_SIZE; const char *bios_virt; const char *scan_end; const char *p; @@ -84,7 +123,7 @@ static int __init alix_present(unsigned long bios_phys, printk(KERN_NOTICE "%s: forced to skip BIOS test, " "assume system is ALIX.2/ALIX.3\n", KBUILD_MODNAME); - return 1; + return true; } bios_virt = phys_to_virt(bios_phys); @@ -109,15 +148,33 @@ static int __init alix_present(unsigned long bios_phys, *a = '\0'; tail = p + alix_sig_len; - if ((tail[0] == '2' || tail[0] == '3')) { + if ((tail[0] == '2' || tail[0] == '3' || tail[0] == '6')) { printk(KERN_INFO "%s: system is recognized as \"%s\"\n", KBUILD_MODNAME, name); - return 1; + return true; } } - return 0; + return false; +} + +static bool __init alix_present_dmi(void) +{ + const char *vendor, *product; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + if (!vendor || strcmp(vendor, "PC Engines")) + return false; + + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product || (strcmp(product, "ALIX.2D") && strcmp(product, "ALIX.6"))) + return false; + + printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n", + KBUILD_MODNAME, vendor, product); + + return true; } static int __init alix_init(void) @@ -128,8 +185,9 @@ static int __init alix_init(void) if (!is_geode()) return 0; - if (alix_present(0xf0000, tinybios_sig, sizeof(tinybios_sig) - 1) || - alix_present(0x500, coreboot_sig, sizeof(coreboot_sig) - 1)) + if (alix_present(BIOS_SIGNATURE_TINYBIOS, tinybios_sig, sizeof(tinybios_sig) - 1) || + alix_present(BIOS_SIGNATURE_COREBOOT, coreboot_sig, sizeof(coreboot_sig) - 1) 
|| + alix_present_dmi()) register_alix(); return 0; -- cgit v1.2.3 From da4e3302949f4a702f1ddfefe067762232d363d5 Mon Sep 17 00:00:00 2001 From: Philip Prindeville Date: Mon, 5 Mar 2012 15:05:15 -0800 Subject: x86/geode/net5501: Add platform driver for Soekris Engineering net5501 Add platform driver for the Soekris Engineering net5501 single-board computer. Probes well-known locations in ROM for BIOS signature to confirm correct platform. Registers 1 LED and 1 GPIO-based button (typically used for soft reset). Signed-off-by: Philip Prindeville Acked-by: Alessandro Zummo Cc: Richard Purdie Cc: Andres Salomon Cc: Matthew Garrett [ Removed Kconfig and Makefile detritus from drivers/leds/] Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/n/tip-jv5uf34996juqh5syes8mn4h@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 6 ++ arch/x86/platform/geode/Makefile | 1 + arch/x86/platform/geode/net5501.c | 154 ++++++++++++++++++++++++++++++++++++++ drivers/leds/Kconfig | 10 --- drivers/leds/Makefile | 1 - drivers/leds/leds-net5501.c | 97 ------------------------ 6 files changed, 161 insertions(+), 108 deletions(-) create mode 100644 arch/x86/platform/geode/net5501.c delete mode 100644 drivers/leds/leds-net5501.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c0d49316a63d..e76f1dbadcd6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2114,6 +2114,12 @@ config ALIX Note: You have to set alix.force=1 for boards with Award BIOS. +config NET5501 + bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)" + select GPIOLIB + ---help--- + This option enables system support for the Soekris Engineering net5501. 
+ endif # X86_32 config AMD_NB diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile index 07c9cd05021a..246b788847ff 100644 --- a/arch/x86/platform/geode/Makefile +++ b/arch/x86/platform/geode/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_ALIX) += alix.o +obj-$(CONFIG_NET5501) += net5501.o diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c new file mode 100644 index 000000000000..66d377e334f7 --- /dev/null +++ b/arch/x86/platform/geode/net5501.c @@ -0,0 +1,154 @@ +/* + * System Specific setup for Soekris net5501 + * At the moment this means setup of GPIO control of LEDs and buttons + * on net5501 boards. + * + * + * Copyright (C) 2008-2009 Tower Technologies + * Written by Alessandro Zummo + * + * Copyright (C) 2008 Constantin Baranov + * Copyright (C) 2011 Ed Wildgoose + * and Philip Prindeville + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define BIOS_REGION_BASE 0xffff0000 +#define BIOS_REGION_SIZE 0x00010000 + +static struct gpio_keys_button net5501_gpio_buttons[] = { + { + .code = KEY_RESTART, + .gpio = 24, + .active_low = 1, + .desc = "Reset button", + .type = EV_KEY, + .wakeup = 0, + .debounce_interval = 100, + .can_disable = 0, + } +}; +static struct gpio_keys_platform_data net5501_buttons_data = { + .buttons = net5501_gpio_buttons, + .nbuttons = ARRAY_SIZE(net5501_gpio_buttons), + .poll_interval = 20, +}; + +static struct platform_device net5501_buttons_dev = { + .name = "gpio-keys-polled", + .id = 1, + .dev = { + .platform_data = &net5501_buttons_data, + } +}; + +static struct gpio_led net5501_leds[] = { + { + .name = "net5501:1", + .gpio = 6, + .default_trigger = "default-on", + .active_low = 1, + }, +}; + +static struct gpio_led_platform_data net5501_leds_data = { + .num_leds = ARRAY_SIZE(net5501_leds), + .leds = net5501_leds, +}; + +static struct platform_device net5501_leds_dev = { + .name = "leds-gpio", + .id = -1, + .dev.platform_data = &net5501_leds_data, +}; + +static struct __initdata platform_device *net5501_devs[] = { + &net5501_buttons_dev, + &net5501_leds_dev, +}; + +static void __init register_net5501(void) +{ + /* Setup LED control through leds-gpio driver */ + platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs)); +} + +struct net5501_board { + u16 offset; + u16 len; + char *sig; +}; + +static struct net5501_board __initdata boards[] = { + { 0xb7b, 7, "net5501" }, /* net5501 v1.33/1.33c */ + { 0xb1f, 7, "net5501" }, /* net5501 v1.32i */ +}; + +static bool __init net5501_present(void) +{ + int i; + unsigned char *rombase, *bios; + bool found = false; + + rombase = ioremap(BIOS_REGION_BASE, BIOS_REGION_SIZE - 1); + if (!rombase) { + printk(KERN_ERR "%s: failed to get rombase\n", KBUILD_MODNAME); + return found; + } + + bios = rombase + 0x20; /* null 
terminated */ + + if (memcmp(bios, "comBIOS", 7)) + goto unmap; + + for (i = 0; i < ARRAY_SIZE(boards); i++) { + unsigned char *model = rombase + boards[i].offset; + + if (!memcmp(model, boards[i].sig, boards[i].len)) { + printk(KERN_INFO "%s: system is recognized as \"%s\"\n", + KBUILD_MODNAME, model); + + found = true; + break; + } + } + +unmap: + iounmap(rombase); + return found; +} + +static int __init net5501_init(void) +{ + if (!is_geode()) + return 0; + + if (!net5501_present()) + return 0; + + register_net5501(); + + return 0; +} + +module_init(net5501_init); + +MODULE_AUTHOR("Philip Prindeville "); +MODULE_DESCRIPTION("Soekris net5501 System Setup"); +MODULE_LICENSE("GPL"); diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 9ca28fced2b9..8c7a75d53101 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -89,16 +89,6 @@ config LEDS_NET48XX This option enables support for the Soekris net4801 and net4826 error LED. -config LEDS_NET5501 - tristate "LED Support for Soekris net5501 series Error LED" - depends on LEDS_TRIGGERS - depends on X86 && GPIO_CS5535 - select LEDS_TRIGGER_DEFAULT_ON - default n - help - Add support for the Soekris net5501 board (detection, error led - and GPIO). 
- config LEDS_FSG tristate "LED Support for the Freecom FSG-3" depends on LEDS_CLASS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 1fc6875a8b20..6bcf4f695515 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -14,7 +14,6 @@ obj-$(CONFIG_LEDS_MIKROTIK_RB532) += leds-rb532.o obj-$(CONFIG_LEDS_S3C24XX) += leds-s3c24xx.o obj-$(CONFIG_LEDS_AMS_DELTA) += leds-ams-delta.o obj-$(CONFIG_LEDS_NET48XX) += leds-net48xx.o -obj-$(CONFIG_LEDS_NET5501) += leds-net5501.o obj-$(CONFIG_LEDS_WRAP) += leds-wrap.o obj-$(CONFIG_LEDS_COBALT_QUBE) += leds-cobalt-qube.o obj-$(CONFIG_LEDS_COBALT_RAQ) += leds-cobalt-raq.o diff --git a/drivers/leds/leds-net5501.c b/drivers/leds/leds-net5501.c deleted file mode 100644 index 0555d4709a7c..000000000000 --- a/drivers/leds/leds-net5501.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Soekris board support code - * - * Copyright (C) 2008-2009 Tower Technologies - * Written by Alessandro Zummo - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static const struct gpio_led net5501_leds[] = { - { - .name = "error", - .gpio = 6, - .default_trigger = "default-on", - }, -}; - -static struct gpio_led_platform_data net5501_leds_data = { - .num_leds = ARRAY_SIZE(net5501_leds), - .leds = net5501_leds, -}; - -static struct platform_device net5501_leds_dev = { - .name = "leds-gpio", - .id = -1, - .dev.platform_data = &net5501_leds_data, -}; - -static void __init init_net5501(void) -{ - platform_device_register(&net5501_leds_dev); -} - -struct soekris_board { - u16 offset; - char *sig; - u8 len; - void (*init)(void); -}; - -static struct soekris_board __initdata boards[] = { - { 0xb7b, "net5501", 7, init_net5501 }, /* net5501 v1.33/1.33c */ - { 0xb1f, "net5501", 7, init_net5501 }, /* net5501 v1.32i */ -}; - -static int __init soekris_init(void) -{ - int i; - unsigned char *rombase, *bios; - - if (!is_geode()) - return 0; - - rombase = ioremap(0xffff0000, 0xffff); - if (!rombase) { - printk(KERN_INFO "Soekris net5501 LED driver failed to get rombase"); - return 0; - } - - bios = rombase + 0x20; /* null terminated */ - - if (strncmp(bios, "comBIOS", 7)) - goto unmap; - - for (i = 0; i < ARRAY_SIZE(boards); i++) { - unsigned char *model = rombase + boards[i].offset; - - if (strncmp(model, boards[i].sig, boards[i].len) == 0) { - printk(KERN_INFO "Soekris %s: %s\n", model, bios); - - if (boards[i].init) - boards[i].init(); - break; - } - } - -unmap: - iounmap(rombase); - return 0; -} - -arch_initcall(soekris_init); - -MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 86b4ce3156c0dc140907ad03639564000cde694f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Mar 2012 22:32:09 +0900 Subject: x86/kprobes: Fix instruction recovery on optimized path Current probed-instruction recovery expects that only breakpoint instruction modifies instruction. 
However, since kprobes jump optimization can replace original instructions with a jump, that expectation is not enough. And it may cause instruction decoding failure on the function where an optimized probe already exists. This bug can reproduce easily as below: 1) find a target function address (any kprobe-able function is OK) $ grep __secure_computing /proc/kallsyms ffffffff810c19d0 T __secure_computing 2) decode the function $ objdump -d vmlinux --start-address=0xffffffff810c19d0 --stop-address=0xffffffff810c19eb vmlinux: file format elf64-x86-64 Disassembly of section .text: ffffffff810c19d0 <__secure_computing>: ffffffff810c19d0: 55 push %rbp ffffffff810c19d1: 48 89 e5 mov %rsp,%rbp ffffffff810c19d4: e8 67 8f 72 00 callq ffffffff817ea940 ffffffff810c19d9: 65 48 8b 04 25 40 b8 mov %gs:0xb840,%rax ffffffff810c19e0: 00 00 ffffffff810c19e2: 83 b8 88 05 00 00 01 cmpl $0x1,0x588(%rax) ffffffff810c19e9: 74 05 je ffffffff810c19f0 <__secure_computing+0x20> 3) put a kprobe-event at an optimize-able place, where no call/jump places within the 5 bytes. $ su - # cd /sys/kernel/debug/tracing # echo p __secure_computing+0x9 > kprobe_events 4) enable it and check it is optimized. # echo 1 > events/kprobes/p___secure_computing_9/enable # cat ../kprobes/list ffffffff810c19d9 k __secure_computing+0x9 [OPTIMIZED] 5) put another kprobe on an instruction after previous probe in the same function. # echo p __secure_computing+0x12 >> kprobe_events bash: echo: write error: Invalid argument # dmesg | tail -n 1 [ 1666.500016] Probing address(0xffffffff810c19e2) is not an instruction boundary. 6) however, if the kprobes optimization is disabled, it works. # echo 0 > /proc/sys/debug/kprobes-optimization # cat ../kprobes/list ffffffff810c19d9 k __secure_computing+0x9 # echo p __secure_computing+0x12 >> kprobe_events (no error) This is because kprobes doesn't recover the instruction which is overwritten with a relative jump by another kprobe when finding instruction boundary. 
It only recovers the breakpoint instruction. This patch fixes kprobes to recover such instructions. With this fix: # echo p __secure_computing+0x9 > kprobe_events # echo 1 > events/kprobes/p___secure_computing_9/enable # cat ../kprobes/list ffffffff810c1aa9 k __secure_computing+0x9 [OPTIMIZED] # echo p __secure_computing+0x12 >> kprobe_events # cat ../kprobes/list ffffffff810c1aa9 k __secure_computing+0x9 [OPTIMIZED] ffffffff810c1ab2 k __secure_computing+0x12 [DISABLED] Changes in v4: - Fix a bug to ensure optimized probe is really optimized by jump. - Remove kprobe_optready() dependency. - Cleanup code for preparing optprobe separation. Changes in v3: - Fix a build error when CONFIG_OPTPROBE=n. (Thanks, Ingo!) To fix the error, split optprobe instruction recovering path from kprobes path. - Cleanup comments/styles. Changes in v2: - Fix a bug to recover original instruction address in RIP-relative instruction fixup. - Moved on tip/master. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: yrl.pp-manager.tt@hitachi.com Cc: systemtap@sourceware.org Cc: anderson@redhat.com Link: http://lkml.kernel.org/r/20120305133209.5982.36568.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 140 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 97 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7da647d8b64c..6bec22f514b5 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -207,13 +207,15 @@ retry: } } -/* Recover the probed instruction at addr for further analysis. 
*/ -static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, + unsigned long addr) { struct kprobe *kp; + kp = get_kprobe((void *)addr); + /* There is no probe, return original address */ if (!kp) - return -EINVAL; + return addr; /* * Basically, kp->ainsn.insn has an original instruction. @@ -230,14 +232,76 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) */ memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); buf[0] = kp->opcode; - return 0; + return (unsigned long)buf; +} + +#ifdef CONFIG_OPTPROBES +static unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, + unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op->list is not empty, op is under optimizing */ + if (list_empty(&op->list)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + } + + return (unsigned long)buf; +} +#else +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, + unsigned long addr) +{ + return addr; +} +#endif + +/* + * Recover the probed instruction at addr for further analysis. 
+ * Caller must lock kprobes by kprobe_mutex, or disable preemption + * for preventing to release referencing kprobes. + */ +static unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr) +{ + unsigned long __addr; + + __addr = __recover_optprobed_insn(buf, addr); + if (__addr != addr) + return __addr; + + return __recover_probed_insn(buf, addr); } /* Check if paddr is at an instruction boundary */ static int __kprobes can_probe(unsigned long paddr) { - int ret; - unsigned long addr, offset = 0; + unsigned long addr, __addr, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -247,26 +311,24 @@ static int __kprobes can_probe(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr) { - kernel_insn_init(&insn, (void *)addr); - insn_get_opcode(&insn); - /* * Check if the instruction has been modified by another * kprobe, in which case we replace the breakpoint by the * original instruction in our buffer. + * Also, jump optimization will change the breakpoint to + * relative-jump. Since the relative-jump itself is + * normally used, we just go through if there is no kprobe. */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { - ret = recover_probed_instruction(buf, addr); - if (ret) - /* - * Another debugging subsystem might insert - * this breakpoint. In that case, we can't - * recover it. - */ - return 0; - kernel_insn_init(&insn, buf); - } + __addr = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)__addr); insn_get_length(&insn); + + /* + * Another debugging subsystem might insert this breakpoint. + * In that case, we can't recover it. 
+ */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; addr += insn.length; } @@ -302,21 +364,17 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { struct insn insn; - int ret; kprobe_opcode_t buf[MAX_INSN_SIZE]; + u8 *orig_src = src; /* Back up original src for RIP calculation */ + + if (recover) + src = (u8 *)recover_probed_instruction(buf, (unsigned long)src); kernel_insn_init(&insn, src); - if (recover) { - insn_get_opcode(&insn); - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { - ret = recover_probed_instruction(buf, - (unsigned long)src); - if (ret) - return 0; - kernel_insn_init(&insn, buf); - } - } insn_get_length(&insn); + /* Another subsystem puts a breakpoint, failed to recover */ + if (recover && insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; memcpy(dest, insn.kaddr, insn.length); #ifdef CONFIG_X86_64 @@ -337,8 +395,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) src + (s64) insn.displacement.value - - (u8 *) dest; + newdisp = (u8 *) orig_src + (s64) insn.displacement.value - (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; @@ -1271,8 +1328,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) /* Decode whole function to ensure any instructions don't jump into target */ static int __kprobes can_optimize(unsigned long paddr) { - int ret; - unsigned long addr, size = 0, offset = 0; + unsigned long addr, __addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -1301,15 +1357,12 @@ static int __kprobes can_optimize(unsigned long paddr) * we can't optimize kprobe in this function. 
*/ return 0; - kernel_insn_init(&insn, (void *)addr); - insn_get_opcode(&insn); - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { - ret = recover_probed_instruction(buf, addr); - if (ret) - return 0; - kernel_insn_init(&insn, buf); - } + __addr = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)__addr); insn_get_length(&insn); + /* Another subsystem puts a breakpoint */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); @@ -1366,6 +1419,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) /* * Copy replacing target instructions * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. */ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) { -- cgit v1.2.3 From 464846888d9aad186cab3acdae6b654f9eb19772 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Mar 2012 22:32:16 +0900 Subject: x86/kprobes: Fix a bug which can modify kernel code permanently Fix a bug in kprobes which can modify kernel code permanently at run-time. As a result, the kernel can crash when it executes the modified code. This bug can happen when we put two probes close enough together and the first probe is optimized. When the second probe is set up, it copies a byte which is already modified by the first probe, and executes it when the probe is hit. Even worse, when the first probe and then the second probe are removed, the second probe writes back the copied (modified) instruction. To fix this bug, kprobes always recovers the original code and copies the first byte from the recovered instruction.
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: yrl.pp-manager.tt@hitachi.com Cc: systemtap@sourceware.org Cc: anderson@redhat.com Link: http://lkml.kernel.org/r/20120305133215.5982.31991.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 6bec22f514b5..ca6d450bee7e 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -361,19 +361,15 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) * If not, return null. * Only applicable to 64-bit x86. */ -static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) +static int __kprobes __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - u8 *orig_src = src; /* Back up original src for RIP calculation */ - if (recover) - src = (u8 *)recover_probed_instruction(buf, (unsigned long)src); - - kernel_insn_init(&insn, src); + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ - if (recover && insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; memcpy(dest, insn.kaddr, insn.length); @@ -395,7 +391,7 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) orig_src + (s64) insn.displacement.value - (u8 *) dest; + newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. 
*/ disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; @@ -406,18 +402,20 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) static void __kprobes arch_copy_kprobe(struct kprobe *p) { + /* Copy an instruction with recovering if other optprobe modifies it.*/ + __copy_instruction(p->ainsn.insn, p->addr); + /* - * Copy an instruction without recovering int3, because it will be - * put by another subsystem. + * __copy_instruction can modify the displacement of the instruction, + * but it doesn't affect boostable check. */ - __copy_instruction(p->ainsn.insn, p->addr, 0); - - if (can_boost(p->addr)) + if (can_boost(p->ainsn.insn)) p->ainsn.boostable = 0; else p->ainsn.boostable = -1; - p->opcode = *p->addr; + /* Also, displacement change doesn't affect the first byte */ + p->opcode = p->ainsn.insn[0]; } int __kprobes arch_prepare_kprobe(struct kprobe *p) @@ -1276,7 +1274,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len, 1); + ret = __copy_instruction(dest + len, src + len); if (!ret || !can_boost(dest + len)) return -EINVAL; len += ret; @@ -1328,7 +1326,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) /* Decode whole function to ensure any instructions don't jump into target */ static int __kprobes can_optimize(unsigned long paddr) { - unsigned long addr, __addr, size = 0, offset = 0; + unsigned long addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -1357,8 +1355,7 @@ static int __kprobes can_optimize(unsigned long paddr) * we can't optimize kprobe in this function. 
*/ return 0; - __addr = recover_probed_instruction(buf, addr); - kernel_insn_init(&insn, (void *)__addr); + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) -- cgit v1.2.3 From 3f33ab1c0c741bfab2138c14ba1918a7905a1e8b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Mar 2012 22:32:22 +0900 Subject: x86/kprobes: Split out optprobe related code to kprobes-opt.c Split out optprobe related code to arch/x86/kernel/kprobes-opt.c for maintainability. Signed-off-by: Masami Hiramatsu Suggested-by: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: yrl.pp-manager.tt@hitachi.com Cc: systemtap@sourceware.org Cc: anderson@redhat.com Link: http://lkml.kernel.org/r/20120305133222.5982.54794.stgit@localhost.localdomain [ Tidied up the code a tiny bit ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kprobes-common.h | 102 +++++++ arch/x86/kernel/kprobes-opt.c | 512 ++++++++++++++++++++++++++++++++ arch/x86/kernel/kprobes.c | 625 ++------------------------------------- 4 files changed, 646 insertions(+), 594 deletions(-) create mode 100644 arch/x86/kernel/kprobes-common.h create mode 100644 arch/x86/kernel/kprobes-opt.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369059c07a9..532d2e090e6f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_OPTPROBES) += kprobes-opt.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h new file mode 100644 index 000000000000..3230b68ef29a --- /dev/null
+++ b/arch/x86/kernel/kprobes-common.h @@ -0,0 +1,102 @@ +#ifndef __X86_KERNEL_KPROBES_COMMON_H +#define __X86_KERNEL_KPROBES_COMMON_H + +/* Kprobes and Optprobes common header */ + +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %es\n" \ + " pushl %ds\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + +/* Ensure if the instruction can be boostable */ +extern int can_boost(kprobe_opcode_t *instruction); +/* Recover instruction if given address is probed */ +extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr); +/* + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. 
+ */ +extern int __copy_instruction(u8 *dest, u8 *src); + +/* Generate a relative-jump/call instruction */ +extern void synthesize_reljump(void *from, void *to); +extern void synthesize_relcall(void *from, void *to); + +#ifdef CONFIG_OPTPROBES +extern int arch_init_optprobes(void); +extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); +extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); +#else /* !CONFIG_OPTPROBES */ +static inline int arch_init_optprobes(void) +{ + return 0; +} +static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + return 0; +} +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + return addr; +} +#endif +#endif diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c new file mode 100644 index 000000000000..c5e410eed403 --- /dev/null +++ b/arch/x86/kernel/kprobes-opt.c @@ -0,0 +1,512 @@ +/* + * Kernel Probes Jump Optimization (Optprobes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Hitachi Ltd., 2012 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "kprobes-common.h" + +unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op->list is not empty, op is under optimizing */ + if (list_empty(&op->list)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + } + + return (unsigned long)buf; +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). 
*/ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +static void __used __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry:\n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end:\n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; + + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + + local_irq_save(flags); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* 
Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __this_cpu_write(current_kprobe, &op->kp); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __this_cpu_write(current_kprobe, NULL); + } + local_irq_restore(flags); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ 
+static int __kprobes can_optimize(unsigned long paddr) +{ + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) + return 0; + + /* + * Do not optimize in the entry code due to the unstable + * stack handling. + */ + if ((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + insn_get_length(&insn); + /* Another subsystem puts a breakpoint */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. 
*/ +int __kprobes +arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. 
+ */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +#define MAX_OPTIMIZE_PROBES 256 +static struct text_poke_param *jump_poke_params; +static struct jump_poke_buffer { + u8 buf[RELATIVEJUMP_SIZE]; +} *jump_poke_bufs; + +static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. 
+ */ +void __kprobes arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + WARN_ON(kprobe_disabled(&op->kp)); + /* Setup param */ + setup_optimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_del_init(&op->list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); +} + +static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + /* Set int3 to first byte for kprobes */ + insn_buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + /* Setup param */ + setup_unoptimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_move(&op->list, done_list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); +} + +/* Replace a relative jump with a breakpoint (int3). 
*/ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +int __kprobes +setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} + +int __kprobes arch_init_optprobes(void) +{ + /* Allocate code buffer and parameter array */ + jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_bufs) + return -ENOMEM; + + jump_poke_params = kmalloc(sizeof(struct text_poke_param) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_params) { + kfree(jump_poke_bufs); + jump_poke_bufs = NULL; + return -ENOMEM; + } + + return 0; +} diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index ca6d450bee7e..e213fc8408d2 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -30,16 +30,15 @@ * and Prasanna S Panchamukhi * added function-return probes. * 2005-May Rusty Lynch - * Added function return probes functionality + * Added function return probes functionality * 2006-Feb Masami Hiramatsu added - * kprobe-booster and kretprobe-booster for i386. + * kprobe-booster and kretprobe-booster for i386. * 2007-Dec Masami Hiramatsu added kprobe-booster - * and kretprobe-booster for x86-64 + * and kretprobe-booster for x86-64 * 2007-Dec Masami Hiramatsu , Arjan van de Ven - * and Jim Keniston - * unified x86 kprobes code. 
+ * and Jim Keniston + * unified x86 kprobes code. */ - #include #include #include @@ -59,6 +58,8 @@ #include #include +#include "kprobes-common.h" + void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; @@ -108,6 +109,7 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { doesn't switch kernel stack.*/ {NULL, NULL} /* Terminator */ }; + const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) @@ -123,11 +125,17 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) } /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes synthesize_reljump(void *from, void *to) +void __kprobes synthesize_reljump(void *from, void *to) { __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + /* * Skip the prefixes of the instruction. */ @@ -151,7 +159,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) * Returns non-zero if opcode is boostable. 
* RIP relative instructions are adjusted at copying time in 64 bits mode */ -static int __kprobes can_boost(kprobe_opcode_t *opcodes) +int __kprobes can_boost(kprobe_opcode_t *opcodes) { kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; @@ -207,8 +215,8 @@ retry: } } -static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, - unsigned long addr) +static unsigned long +__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) { struct kprobe *kp; @@ -235,59 +243,12 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, return (unsigned long)buf; } -#ifdef CONFIG_OPTPROBES -static unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, - unsigned long addr) -{ - struct optimized_kprobe *op; - struct kprobe *kp; - long offs; - int i; - - for (i = 0; i < RELATIVEJUMP_SIZE; i++) { - kp = get_kprobe((void *)addr - i); - /* This function only handles jump-optimized kprobe */ - if (kp && kprobe_optimized(kp)) { - op = container_of(kp, struct optimized_kprobe, kp); - /* If op->list is not empty, op is under optimizing */ - if (list_empty(&op->list)) - goto found; - } - } - - return addr; -found: - /* - * If the kprobe can be optimized, original bytes which can be - * overwritten by jump destination address. In this case, original - * bytes must be recovered from op->optinsn.copied_insn buffer. - */ - memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - if (addr == (unsigned long)kp->addr) { - buf[0] = kp->opcode; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - } else { - offs = addr - (unsigned long)kp->addr - 1; - memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); - } - - return (unsigned long)buf; -} -#else -static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, - unsigned long addr) -{ - return addr; -} -#endif - /* * Recover the probed instruction at addr for further analysis. 
* Caller must lock kprobes by kprobe_mutex, or disable preemption * for preventing to release referencing kprobes. */ -static unsigned long recover_probed_instruction(kprobe_opcode_t *buf, - unsigned long addr) +unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) { unsigned long __addr; @@ -361,7 +322,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) * If not, return null. * Only applicable to 64-bit x86. */ -static int __kprobes __copy_instruction(u8 *dest, u8 *src) +int __kprobes __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -497,8 +458,8 @@ static void __kprobes restore_btf(void) } } -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) +void __kprobes +arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long *sara = stack_addr(regs); @@ -508,16 +469,8 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } -#ifdef CONFIG_OPTPROBES -static int __kprobes setup_detour_execution(struct kprobe *p, - struct pt_regs *regs, - int reenter); -#else -#define setup_detour_execution(p, regs, reenter) (0) -#endif - -static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb, int reenter) +static void __kprobes +setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) { if (setup_detour_execution(p, regs, reenter)) return; @@ -559,8 +512,8 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, * within the handler. We save the original kprobes variables and just single * step on the instruction of the new probe without calling any user handlers. 
*/ -static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) +static int __kprobes +reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: @@ -655,69 +608,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } -#ifdef CONFIG_X86_64 -#define SAVE_REGS_STRING \ - /* Skip cs, ip, orig_ax. */ \ - " subq $24, %rsp\n" \ - " pushq %rdi\n" \ - " pushq %rsi\n" \ - " pushq %rdx\n" \ - " pushq %rcx\n" \ - " pushq %rax\n" \ - " pushq %r8\n" \ - " pushq %r9\n" \ - " pushq %r10\n" \ - " pushq %r11\n" \ - " pushq %rbx\n" \ - " pushq %rbp\n" \ - " pushq %r12\n" \ - " pushq %r13\n" \ - " pushq %r14\n" \ - " pushq %r15\n" -#define RESTORE_REGS_STRING \ - " popq %r15\n" \ - " popq %r14\n" \ - " popq %r13\n" \ - " popq %r12\n" \ - " popq %rbp\n" \ - " popq %rbx\n" \ - " popq %r11\n" \ - " popq %r10\n" \ - " popq %r9\n" \ - " popq %r8\n" \ - " popq %rax\n" \ - " popq %rcx\n" \ - " popq %rdx\n" \ - " popq %rsi\n" \ - " popq %rdi\n" \ - /* Skip orig_ax, ip, cs */ \ - " addq $24, %rsp\n" -#else -#define SAVE_REGS_STRING \ - /* Skip cs, ip, orig_ax and gs. */ \ - " subl $16, %esp\n" \ - " pushl %fs\n" \ - " pushl %es\n" \ - " pushl %ds\n" \ - " pushl %eax\n" \ - " pushl %ebp\n" \ - " pushl %edi\n" \ - " pushl %esi\n" \ - " pushl %edx\n" \ - " pushl %ecx\n" \ - " pushl %ebx\n" -#define RESTORE_REGS_STRING \ - " popl %ebx\n" \ - " popl %ecx\n" \ - " popl %edx\n" \ - " popl %esi\n" \ - " popl %edi\n" \ - " popl %ebp\n" \ - " popl %eax\n" \ - /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ - " addl $24, %esp\n" -#endif - /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. 
@@ -871,8 +761,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) * jump instruction after the copied instruction, that jumps to the next * instruction after the probepoint. */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) +static void __kprobes +resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { unsigned long *tos = stack_addr(regs); unsigned long copy_ip = (unsigned long)p->ainsn.insn; @@ -1051,8 +941,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) /* * Wrapper routine for handling exceptions. */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +int __kprobes +kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = data; int ret = NOTIFY_DONE; @@ -1162,462 +1052,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } - -#ifdef CONFIG_OPTPROBES - -/* Insert a call instruction at address 'from', which calls address 'to'.*/ -static void __kprobes synthesize_relcall(void *from, void *to) -{ - __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); -} - -/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). 
*/ -static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, - unsigned long val) -{ -#ifdef CONFIG_X86_64 - *addr++ = 0x48; - *addr++ = 0xbf; -#else - *addr++ = 0xb8; -#endif - *(unsigned long *)addr = val; -} - -static void __used __kprobes kprobes_optinsn_template_holder(void) -{ - asm volatile ( - ".global optprobe_template_entry\n" - "optprobe_template_entry: \n" -#ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rsi\n" - ".global optprobe_template_val\n" - "optprobe_template_val: \n" - ASM_NOP5 - ASM_NOP5 - ".global optprobe_template_call\n" - "optprobe_template_call: \n" - ASM_NOP5 - /* Move flags to rsp */ - " movq 144(%rsp), %rdx\n" - " movq %rdx, 152(%rsp)\n" - RESTORE_REGS_STRING - /* Skip flags entry */ - " addq $8, %rsp\n" - " popfq\n" -#else /* CONFIG_X86_32 */ - " pushf\n" - SAVE_REGS_STRING - " movl %esp, %edx\n" - ".global optprobe_template_val\n" - "optprobe_template_val: \n" - ASM_NOP5 - ".global optprobe_template_call\n" - "optprobe_template_call: \n" - ASM_NOP5 - RESTORE_REGS_STRING - " addl $4, %esp\n" /* skip cs */ - " popf\n" -#endif - ".global optprobe_template_end\n" - "optprobe_template_end: \n"); -} - -#define TMPL_MOVE_IDX \ - ((long)&optprobe_template_val - (long)&optprobe_template_entry) -#define TMPL_CALL_IDX \ - ((long)&optprobe_template_call - (long)&optprobe_template_entry) -#define TMPL_END_IDX \ - ((long)&optprobe_template_end - (long)&optprobe_template_entry) - -#define INT3_SIZE sizeof(kprobe_opcode_t) - -/* Optimized kprobe call back function: called from optinsn */ -static void __kprobes optimized_callback(struct optimized_kprobe *op, - struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long flags; - - /* This is possible if op is under delayed unoptimizing */ - if (kprobe_disabled(&op->kp)) - return; - - local_irq_save(flags); - if (kprobe_running()) { - kprobes_inc_nmissed_count(&op->kp); - } 
else { - /* Save skipped registers */ -#ifdef CONFIG_X86_64 - regs->cs = __KERNEL_CS; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); - regs->gs = 0; -#endif - regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; - regs->orig_ax = ~0UL; - - __this_cpu_write(current_kprobe, &op->kp); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - opt_pre_handler(&op->kp, regs); - __this_cpu_write(current_kprobe, NULL); - } - local_irq_restore(flags); -} - -static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) -{ - int len = 0, ret; - - while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len); - if (!ret || !can_boost(dest + len)) - return -EINVAL; - len += ret; - } - /* Check whether the address range is reserved */ - if (ftrace_text_reserved(src, src + len - 1) || - alternatives_text_reserved(src, src + len - 1) || - jump_label_text_reserved(src, src + len - 1)) - return -EBUSY; - - return len; -} - -/* Check whether insn is indirect jump */ -static int __kprobes insn_is_indirect_jump(struct insn *insn) -{ - return ((insn->opcode.bytes[0] == 0xff && - (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ - insn->opcode.bytes[0] == 0xea); /* Segment based jump */ -} - -/* Check whether insn jumps into specified address range */ -static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) -{ - unsigned long target = 0; - - switch (insn->opcode.bytes[0]) { - case 0xe0: /* loopne */ - case 0xe1: /* loope */ - case 0xe2: /* loop */ - case 0xe3: /* jcxz */ - case 0xe9: /* near relative jump */ - case 0xeb: /* short relative jump */ - break; - case 0x0f: - if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ - break; - return 0; - default: - if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ - break; - return 0; - } - target = (unsigned long)insn->next_byte + insn->immediate.value; - - return (start <= target && target <= start + len); -} - -/* Decode whole function to ensure any instructions don't jump into 
target */ -static int __kprobes can_optimize(unsigned long paddr) -{ - unsigned long addr, size = 0, offset = 0; - struct insn insn; - kprobe_opcode_t buf[MAX_INSN_SIZE]; - - /* Lookup symbol including addr */ - if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) - return 0; - - /* - * Do not optimize in the entry code due to the unstable - * stack handling. - */ - if ((paddr >= (unsigned long )__entry_text_start) && - (paddr < (unsigned long )__entry_text_end)) - return 0; - - /* Check there is enough space for a relative jump. */ - if (size - offset < RELATIVEJUMP_SIZE) - return 0; - - /* Decode instructions */ - addr = paddr - offset; - while (addr < paddr - offset + size) { /* Decode until function end */ - if (search_exception_tables(addr)) - /* - * Since some fixup code will jumps into this function, - * we can't optimize kprobe in this function. - */ - return 0; - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); - insn_get_length(&insn); - /* Another subsystem puts a breakpoint */ - if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) - return 0; - /* Recover address */ - insn.kaddr = (void *)addr; - insn.next_byte = (void *)(addr + insn.length); - /* Check any instructions don't jump into target */ - if (insn_is_indirect_jump(&insn) || - insn_jump_into_range(&insn, paddr + INT3_SIZE, - RELATIVE_ADDR_SIZE)) - return 0; - addr += insn.length; - } - - return 1; -} - -/* Check optimized_kprobe can actually be optimized. */ -int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) -{ - int i; - struct kprobe *p; - - for (i = 1; i < op->optinsn.size; i++) { - p = get_kprobe(op->kp.addr + i); - if (p && !kprobe_disabled(p)) - return -EEXIST; - } - - return 0; -} - -/* Check the addr is within the optimized instructions. 
*/ -int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr) -{ - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + op->optinsn.size > addr); -} - -/* Free optimized instruction slot */ -static __kprobes -void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) -{ - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); - op->optinsn.insn = NULL; - op->optinsn.size = 0; - } -} - -void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) -{ - __arch_remove_optimized_kprobe(op, 1); -} - -/* - * Copy replacing target instructions - * Target instructions MUST be relocatable (checked inside) - * This is called when new aggr(opt)probe is allocated or reused. - */ -int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) -{ - u8 *buf; - int ret; - long rel; - - if (!can_optimize((unsigned long)op->kp.addr)) - return -EILSEQ; - - op->optinsn.insn = get_optinsn_slot(); - if (!op->optinsn.insn) - return -ENOMEM; - - /* - * Verify if the address gap is in 2GB range, because this uses - * a relative jump. 
- */ - rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; - if (abs(rel) > 0x7fffffff) - return -ERANGE; - - buf = (u8 *)op->optinsn.insn; - - /* Copy instructions into the out-of-line buffer */ - ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); - if (ret < 0) { - __arch_remove_optimized_kprobe(op, 0); - return ret; - } - op->optinsn.size = ret; - - /* Copy arch-dep-instance from template */ - memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); - - /* Set probe information */ - synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); - - /* Set probe function call */ - synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); - - /* Set returning jmp instruction at the tail of out-of-line buffer */ - synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, - (u8 *)op->kp.addr + op->optinsn.size); - - flush_icache_range((unsigned long) buf, - (unsigned long) buf + TMPL_END_IDX + - op->optinsn.size + RELATIVEJUMP_SIZE); - return 0; -} - -#define MAX_OPTIMIZE_PROBES 256 -static struct text_poke_param *jump_poke_params; -static struct jump_poke_buffer { - u8 buf[RELATIVEJUMP_SIZE]; -} *jump_poke_bufs; - -static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - - /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); - - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - -/* - * Replace breakpoints (int3) with relative jumps. - * Caller must call with locking kprobe_mutex and text_mutex. 
- */ -void __kprobes arch_optimize_kprobes(struct list_head *oplist) -{ - struct optimized_kprobe *op, *tmp; - int c = 0; - - list_for_each_entry_safe(op, tmp, oplist, list) { - WARN_ON(kprobe_disabled(&op->kp)); - /* Setup param */ - setup_optimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); - list_del_init(&op->list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; - } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - /* Set int3 to first byte for kprobes */ - insn_buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - -/* - * Recover original instructions and breakpoints from relative jumps. - * Caller must call with locking kprobe_mutex. - */ -extern void arch_unoptimize_kprobes(struct list_head *oplist, - struct list_head *done_list) -{ - struct optimized_kprobe *op, *tmp; - int c = 0; - - list_for_each_entry_safe(op, tmp, oplist, list) { - /* Setup param */ - setup_unoptimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); - list_move(&op->list, done_list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; - } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -/* Replace a relative jump with a breakpoint (int3). 
*/ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) -{ - u8 buf[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); -} - -static int __kprobes setup_detour_execution(struct kprobe *p, - struct pt_regs *regs, - int reenter) -{ - struct optimized_kprobe *op; - - if (p->flags & KPROBE_FLAG_OPTIMIZED) { - /* This kprobe is really able to run optimized path. */ - op = container_of(p, struct optimized_kprobe, kp); - /* Detour through copied instructions */ - regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; - if (!reenter) - reset_current_kprobe(); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -static int __kprobes init_poke_params(void) -{ - /* Allocate code buffer and parameter array */ - jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_bufs) - return -ENOMEM; - - jump_poke_params = kmalloc(sizeof(struct text_poke_param) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_params) { - kfree(jump_poke_bufs); - jump_poke_bufs = NULL; - return -ENOMEM; - } - - return 0; -} -#else /* !CONFIG_OPTPROBES */ -static int __kprobes init_poke_params(void) -{ - return 0; -} -#endif - int __init arch_init_kprobes(void) { - return init_poke_params(); + return arch_init_optprobes(); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) -- cgit v1.2.3 From d1f42e314c9c50541c79a6edf2b4cab63fe02ee3 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Mon, 5 Mar 2012 15:01:00 -0800 Subject: x86/olpc/xo15/sci: Enable lid close wakeup control Like most systems, OLPC's ACPI LID switch wakes up the system when the lid is opened, but not when it is closed. Under OLPC's opportunistic suspend model, the lid may be closed while the system was oportunistically suspended with the screen running. 
In this event, we want to wake up to turn the screen off. Enable control of normal ACPI wakeups through lid close events through a new sysfs attribute "lid_wake_on_closed". When set, and when LID wakeups are enabled through ACPI, the system will wake up on both open and close lid events. Signed-off-by: Daniel Drake Cc: Andres Salomon Cc: Matthew Garrett [ Fixed sscanf checking] Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/n/tip-bgt8hxu2wwe0x5p8edhogtf7@git.kernel.org [ Did very minor readability tweaks ] Signed-off-by: Ingo Molnar --- arch/x86/platform/olpc/olpc-xo15-sci.c | 72 +++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 2b235b77d9ab..23e5b9d7977b 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -23,7 +23,66 @@ #define XO15_SCI_CLASS DRV_NAME #define XO15_SCI_DEVICE_NAME "OLPC XO-1.5 SCI" -static unsigned long xo15_sci_gpe; +static unsigned long xo15_sci_gpe; +static bool lid_wake_on_close; + +/* + * The normal ACPI LID wakeup behavior is wake-on-open, but not + * wake-on-close. This is implemented as standard by the XO-1.5 DSDT. + * + * We provide here a sysfs attribute that will additionally enable + * wake-on-close behavior. This is useful (e.g.) when we oportunistically + * suspend with the display running; if the lid is then closed, we want to + * wake up to turn the display off. + * + * This is controlled through a custom method in the XO-1.5 DSDT. 
+ */ +static int set_lid_wake_behavior(bool wake_on_close) +{ + struct acpi_object_list arg_list; + union acpi_object arg; + acpi_status status; + + arg_list.count = 1; + arg_list.pointer = &arg; + arg.type = ACPI_TYPE_INTEGER; + arg.integer.value = wake_on_close; + + status = acpi_evaluate_object(NULL, "\\_SB.PCI0.LID.LIDW", &arg_list, NULL); + if (ACPI_FAILURE(status)) { + pr_warning(PFX "failed to set lid behavior\n"); + return 1; + } + + lid_wake_on_close = wake_on_close; + + return 0; +} + +static ssize_t +lid_wake_on_close_show(struct kobject *s, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", lid_wake_on_close); +} + +static ssize_t lid_wake_on_close_store(struct kobject *s, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int val; + + if (sscanf(buf, "%u", &val) != 1) + return -EINVAL; + + set_lid_wake_behavior(!!val); + + return n; +} + +static struct kobj_attribute lid_wake_on_close_attr = + __ATTR(lid_wake_on_close, 0644, + lid_wake_on_close_show, + lid_wake_on_close_store); static void battery_status_changed(void) { @@ -91,6 +150,7 @@ static int xo15_sci_add(struct acpi_device *device) { unsigned long long tmp; acpi_status status; + int r; if (!device) return -EINVAL; @@ -112,6 +172,10 @@ static int xo15_sci_add(struct acpi_device *device) dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe); + r = sysfs_create_file(&device->dev.kobj, &lid_wake_on_close_attr.attr); + if (r) + goto err_sysfs; + /* Flush queue, and enable all SCI events */ process_sci_queue(); olpc_ec_mask_write(EC_SCI_SRC_ALL); @@ -123,6 +187,11 @@ static int xo15_sci_add(struct acpi_device *device) device_init_wakeup(&device->dev, true); return 0; + +err_sysfs: + acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler); + cancel_work_sync(&sci_work); + return r; } static int xo15_sci_remove(struct acpi_device *device, int type) @@ -130,6 +199,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type) 
acpi_disable_gpe(NULL, xo15_sci_gpe); acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler); cancel_work_sync(&sci_work); + sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr); return 0; } -- cgit v1.2.3 From 097d59106a8e4b42d07c9892fdd7790f1659c6ff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 18:23:36 -0800 Subject: vm: avoid using find_vma_prev() unnecessarily Several users of "find_vma_prev()" were not in fact interested in the previous vma if there was no primary vma to be found either. And in those cases, we're much better off just using the regular "find_vma()", and then "prev" can be looked up by just checking vma->vm_prev. The find_vma_prev() semantics are fairly subtle (see Mikulas' recent commit 83cd904d271b: "mm: fix find_vma_prev"), and the whole "return prev by reference" means that it generates worse code too. Thus this "let's avoid using this inconvenient and clearly too subtle interface when we don't really have to" patch. Cc: Mikulas Patocka Cc: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 4 +++- mm/mempolicy.c | 3 ++- mm/mlock.c | 3 ++- mm/mprotect.c | 3 ++- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f581a18c0d4d..83e7141c3982 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -333,13 +333,15 @@ try_again: * Lookup failure means no vma is above this address, * i.e. 
return with success: */ - if (!(vma = find_vma_prev(mm, addr, &prev_vma))) + vma = find_vma(mm, add); + if (!vma) return addr; /* * new region fits between prev_vma->vm_end and * vma->vm_start, use it: */ + prev_vma = vma->vm_prev; if (addr + len <= vma->vm_start && (!prev_vma || (addr >= prev_vma->vm_end))) { /* remember the address as a hint for next time */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 06b145fb64ab..47296fee23db 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long vmstart; unsigned long vmend; - vma = find_vma_prev(mm, start, &prev); + vma = find_vma(mm, start); if (!vma || vma->vm_start > start) return -EFAULT; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mlock.c b/mm/mlock.c index 4f4f53bdc65d..ef726e8aa8e9 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on) return -EINVAL; if (end == start) return 0; - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); if (!vma || vma->vm_start > start) return -ENOMEM; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756be..f437d054c3bf 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, down_write(¤t->mm->mmap_sem); - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); error = -ENOMEM; if (!vma) goto out; + prev = vma->vm_prev; if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; -- cgit v1.2.3 From 55062d061790b43aee01ab3f9ac57b8596254f19 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 18:48:13 -0800 Subject: x86: fix typo in recent find_vma_prev purge It turns out that test-compiling this file on x86-64 doesn't really help, because much of it is 
x86-32-specific. And so I hadn't noticed the slightly over-eager removal of the 'r' from 'addr' variable despite thinking I had tested it. Signed-off-by: Linus "oopsie" Torvalds --- arch/x86/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 83e7141c3982..8ecbb4bba4b3 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -333,7 +333,7 @@ try_again: * Lookup failure means no vma is above this address, * i.e. return with success: */ - vma = find_vma(mm, add); + vma = find_vma(mm, addr); if (!vma) return addr; -- cgit v1.2.3 From b11e3d782b9c065b3b2fb543bfb0d97801822dc0 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 7 Mar 2012 11:44:29 +0100 Subject: x86, mce: Fix rcu splat in drain_mce_log_buffer() While booting, the following message is seen: [ 21.665087] =============================== [ 21.669439] [ INFO: suspicious RCU usage. ] [ 21.673798] 3.2.0-0.0.0.28.36b5ec9-default #2 Not tainted [ 21.681353] ------------------------------- [ 21.685864] arch/x86/kernel/cpu/mcheck/mce.c:194 suspicious rcu_dereference_index_check() usage! 
[ 21.695013] [ 21.695014] other info that might help us debug this: [ 21.695016] [ 21.703488] [ 21.703489] rcu_scheduler_active = 1, debug_locks = 1 [ 21.710426] 3 locks held by modprobe/2139: [ 21.714754] #0: (&__lockdep_no_validate__){......}, at: [] __driver_attach+0x53/0xa0 [ 21.725020] #1: [ 21.725323] ioatdma: Intel(R) QuickData Technology Driver 4.00 [ 21.733206] (&__lockdep_no_validate__){......}, at: [] __driver_attach+0x61/0xa0 [ 21.743015] #2: (i7core_edac_lock){+.+.+.}, at: [] i7core_probe+0x1f/0x5c0 [i7core_edac] [ 21.753708] [ 21.753709] stack backtrace: [ 21.758429] Pid: 2139, comm: modprobe Not tainted 3.2.0-0.0.0.28.36b5ec9-default #2 [ 21.768253] Call Trace: [ 21.770838] [] lockdep_rcu_suspicious+0xcd/0x100 [ 21.777366] [] drain_mcelog_buffer+0x191/0x1b0 [ 21.783715] [] mce_register_decode_chain+0x18/0x20 [ 21.790430] [] i7core_register_mci+0x2fb/0x3e4 [i7core_edac] [ 21.798003] [] i7core_probe+0xd4/0x5c0 [i7core_edac] [ 21.804809] [] local_pci_probe+0x5b/0xe0 [ 21.810631] [] __pci_device_probe+0xd9/0xe0 [ 21.816650] [] ? get_device+0x14/0x20 [ 21.822178] [] pci_device_probe+0x36/0x60 [ 21.828061] [] really_probe+0x7a/0x2b0 [ 21.833676] [] driver_probe_device+0x63/0xc0 [ 21.839868] [] __driver_attach+0x9b/0xa0 [ 21.845718] [] ? driver_probe_device+0xc0/0xc0 [ 21.852027] [] bus_for_each_dev+0x68/0x90 [ 21.857876] [] driver_attach+0x1c/0x20 [ 21.863462] [] bus_add_driver+0x16d/0x2b0 [ 21.869377] [] driver_register+0x7c/0x160 [ 21.875220] [] __pci_register_driver+0x6a/0xf0 [ 21.881494] [] ? 0xffffffffa01fdfff [ 21.886846] [] i7core_init+0x47/0x1000 [i7core_edac] [ 21.893737] [] do_one_initcall+0x3e/0x180 [ 21.899670] [] sys_init_module+0xc5/0x220 [ 21.905542] [] system_call_fastpath+0x16/0x1b Fix this by using ACCESS_ONCE() instead of rcu_dereference_check_mce() over mcelog.next. Since the access to each entry is controlled by the ->finished field, ACCESS_ONCE() should work just fine. An rcu_dereference is unnecessary here. Signed-off-by: Srivatsa S. 
Bhat Suggested-by: Paul E. McKenney Cc: Tony Luck Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2e9e91..db590aff874c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -191,7 +191,7 @@ static void drain_mcelog_buffer(void) { unsigned int next, i, prev = 0; - next = rcu_dereference_check_mce(mcelog.next); + next = ACCESS_ONCE(mcelog.next); do { struct mce *m; -- cgit v1.2.3 From 0d2bf4899d04fcc7f3a280b0bc74c084badb4e04 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:18:02 +0000 Subject: x86: Tighten dependencies of CPU_SUP_*_32 Building in support for either of these CPUs is pointless when e.g. M686 was selected (since such a kernel would use cmov instructions, which aren't available on these older CPUs). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F58875A02000078000770E0@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6443c6f038e8..706e12e9984b 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -440,7 +440,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on !64BIT + depends on M386 || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -494,7 +494,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on M386 || M486 || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for UMC processors -- cgit v1.2.3 From c7e23289a6aa95048a78b252b462f24ca6cf7f96 Mon Sep 17 
00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:23:14 +0000 Subject: x86/32: Print control and debug registers for kerenel context While for a user mode register dump it may be reasonable to skip those (albeit x86-64 doesn't do so), for kernel mode dumps these should be printed to make sure all information possibly necessary for analysis is available. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F58889202000078000770E7@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c99f9ed013d5..88ec9129271d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs) int i; print_modules(); - __show_regs(regs, 0); + __show_regs(regs, !user_mode_vm(regs)); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), -- cgit v1.2.3 From a240ada241dafe290e7532d1ddeb98fdf1419068 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:24:57 +0000 Subject: x86: Include probe_roms.h in probe_roms.c ... to ensure that declarations and definitions are in sync. 
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F5888F902000078000770F1@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/probe_roms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e84ce31..0bc72e2069e3 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From cc578287e3224d0da196cc1d226bdae6b068faa7 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:50 -0200 Subject: KVM: Infrastructure for software and hardware based TSC rate scaling This requires some restructuring; rather than use 'virtual_tsc_khz' to indicate whether hardware rate scaling is in effect, we consider each VCPU to always have a virtual TSC rate. Instead, there is new logic above the vendor-specific hardware scaling that decides whether it is even necessary to use and updates all rate variables used by common code. This means we can simply query the virtual rate at any point, which is needed for software rate scaling. There is also now a threshold added to the TSC rate scaling; minor differences and variations of measured TSC rate can accidentally provoke rate scaling to be used when it is not needed. Instead, we have a tolerance variable called tsc_tolerance_ppm, which is the maximum variation from user requested rate at which scaling will be used. The default is 250ppm, which is the half the threshold for NTP adjustment, allowing for some hardware variation. In the event that hardware rate scaling is not available, we can kludge a bit by forcing TSC catchup to turn on when a faster than hardware speed has been requested, but there is nothing available yet for the reverse case; this requires a trap and emulate software implementation for RDTSC, which is still forthcoming. 
[avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 9 +++-- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 20 ++++++---- arch/x86/kvm/vmx.c | 16 +++++--- arch/x86/kvm/x86.c | 82 +++++++++++++++++++++-------------------- 5 files changed, 71 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 782d973b0719..ddebbe01fff9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,10 +422,11 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; - u32 virtual_tsc_khz; bool tsc_catchup; - u32 tsc_catchup_mult; - s8 tsc_catchup_shift; + bool tsc_always_catchup; + s8 virtual_tsc_shift; + u32 virtual_tsc_mult; + u32 virtual_tsc_khz; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -651,7 +652,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ee1d83c695d..72975f758c83 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic) u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; if (unlikely(!tscdeadline || !this_tsc_khz)) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7bbd17cc3488..e12026e5244e 100644 --- a/arch/x86/kvm/svm.c +++ 
b/arch/x86/kvm/svm.c @@ -964,20 +964,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) return _tsc; } -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { struct vcpu_svm *svm = to_svm(vcpu); u64 ratio; u64 khz; - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + svm->tsc_ratio = TSC_RATIO_DEFAULT; return; + } - /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ - if (user_tsc_khz == 0) { - vcpu->arch.virtual_tsc_khz = 0; - svm->tsc_ratio = TSC_RATIO_DEFAULT; + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); return; } @@ -992,7 +997,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) user_tsc_khz); return; } - vcpu->arch.virtual_tsc_khz = user_tsc_khz; svm->tsc_ratio = ratio; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad906..e6bf61fa1c03 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1817,13 +1817,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) } /* - * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ - * ioctl. In this case the call-back should update internal vmx state to make - * the changes effective. + * Engage any workarounds for mis-matched TSC rates. Currently limited to + * software catchup for faster rates on slower CPUs. 
*/ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { - /* Nothing to do here */ + if (!scale) + return; + + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2bd77a3a41ed..41bb90acb238 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ +static u32 tsc_tolerance_ppm = 250; +module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -968,49 +972,50 @@ static inline u64 get_kernel_ns(void) static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; -static inline int kvm_tsc_changes_freq(void) +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { - int cpu = get_cpu(); - int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && - cpufreq_quick_get(cpu) != 0; - put_cpu(); - return ret; + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); } -u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +static u32 adjust_tsc_khz(u32 khz, s32 ppm) { - if (vcpu->arch.virtual_tsc_khz) - return vcpu->arch.virtual_tsc_khz; - else - return __this_cpu_read(cpu_tsc_khz); + u64 v = (u64)khz * (1000000 + ppm); + do_div(v, 1000000); + return v; } -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { - u64 ret; + u32 thresh_lo, thresh_hi; + int use_scaling = 0; - WARN_ON(preemptible()); - if (kvm_tsc_changes_freq()) - printk_once(KERN_WARNING - "kvm: unreliable cycle conversion on adjustable 
rate TSC\n"); - ret = nsec * vcpu_tsc_khz(vcpu); - do_div(ret, USEC_PER_SEC); - return ret; -} - -static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) -{ /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &vcpu->arch.tsc_catchup_shift, - &vcpu->arch.tsc_catchup_mult); + &vcpu->arch.virtual_tsc_shift, + &vcpu->arch.virtual_tsc_mult); + vcpu->arch.virtual_tsc_khz = this_tsc_khz; + + /* + * Compute the variation in TSC rate which is acceptable + * within the range of tolerance and decide if the + * rate being applied is within that bounds of the hardware + * rate. If so, no scaling or compensation need be done. + */ + thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); + thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); + if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + use_scaling = 1; + } + kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->arch.tsc_catchup_mult, - vcpu->arch.tsc_catchup_shift); + vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1077,7 +1082,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); - this_tsc_khz = vcpu_tsc_khz(v); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -2804,26 +2809,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; - if (!kvm_has_tsc_control) - break; - user_tsc_khz = (u32)arg; if (user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; - kvm_x86_ops->set_tsc_khz(vcpu, 
user_tsc_khz); + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + kvm_set_tsc_khz(vcpu, user_tsc_khz); r = 0; goto out; } case KVM_GET_TSC_KHZ: { - r = -EIO; - if (check_tsc_unstable()) - goto out; - - r = vcpu_tsc_khz(vcpu); - + r = vcpu->arch.virtual_tsc_khz; goto out; } default: @@ -5312,6 +5312,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) profile_hit(KVM_PROFILING, (void *)rip); } + if (unlikely(vcpu->arch.tsc_always_catchup)) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_lapic_sync_from_vapic(vcpu); @@ -6004,7 +6006,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - kvm_init_tsc_catchup(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) -- cgit v1.2.3 From 5d3cb0f6a8e3af018a522ae8d36f8f7d2511b5d8 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:51 -0200 Subject: KVM: Improve TSC offset matching There are a few improvements that can be made to the TSC offset matching code. First, we don't need to call the 128-bit multiply (especially on a constant number), the code works much nicer to do computation in nanosecond units. Second, the way everything is setup with software TSC rate scaling, we currently have per-cpu rates. Obviously this isn't too desirable to use in practice, but if for some reason we do change the rate of all VCPUs at runtime, then reset the TSCs, we will only want to match offsets for VCPUs running at the same rate. Finally, for the case where we have an unstable host TSC, but rate scaling is being done in hardware, we should call the platform code to compute the TSC offset, so the math is reorganized to recompute the base instead, then transform the base into an offset using the existing API. [avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti KVM: Fix 64-bit division in kvm_write_tsc() Breaks i386 build. 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ddebbe01fff9..8a34fca6c572 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -513,6 +513,7 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; + u32 last_tsc_khz; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41bb90acb238..4390f42b371f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1025,33 +1025,46 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 sdiff; + s64 nsdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; - sdiff = data - kvm->arch.last_tsc_write; - if (sdiff < 0) - sdiff = -sdiff; + + /* n.b - signed multiplication and division required */ + nsdiff = data - kvm->arch.last_tsc_write; +#ifdef CONFIG_X86_64 + nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz; +#else + /* do_div() only does unsigned */ + asm("idivl %2; xor %%edx, %%edx" + : "=A"(nsdiff) + : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); +#endif + nsdiff -= elapsed; + if (nsdiff < 0) + nsdiff = -nsdiff; /* - * Special case: close write to TSC within 5 seconds of - * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accommodate host load / swapping as - * well as any reset of TSC during the boot process. - * - * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using elapsed value. 
- */ - if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && - elapsed < 5ULL * NSEC_PER_SEC) { + * Special case: TSC write with a small delta (1 second) of virtual + * cycle time against real time is interpreted as an attempt to + * synchronize the CPU. + * + * For a reliable TSC, we can match TSC offsets, and for an unstable + * TSC, we add elapsed time in this computation. We could let the + * compensation code attempt to catch up if we fall behind, but + * it's better to try to match offsets from the beginning. + */ + if (nsdiff < NSEC_PER_SEC && + vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); - offset += delta; + data += delta; + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } ns = kvm->arch.last_tsc_nsec; @@ -1059,6 +1072,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); -- cgit v1.2.3 From 4dd7980b21408624e9b6f3df05719c3c61db6e9f Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:52 -0200 Subject: KVM: Leave TSC synchronization window open with each new sync Currently, when the TSC is written by the guest, the variable ns is updated to force the current write to appear to have taken place at the time of the first write in this sync phase. This leaves a cliff at the end of the match window where updates will fall off the end. There are two scenarios where this can be a problem in practice - first, on a system with a large number of VCPUs, the sync period may last for an extended period of time.
The second way this can happen is if the VM reboots very rapidly and we catch a VCPU TSC synchronization just around the edge. We may be unaware of the reboot, and thus the first VCPU might synchronize with an old set of the timer (at, say, 0.97 seconds ago, when first powered on). The second VCPU can come in 0.04 seconds later to try to synchronize, but it misses the window because it is just over the threshold. Instead, stop doing this artificial setback of the ns variable and just update it with every write of the TSC. It may be observed that doing so causes values computed by compute_guest_tsc to diverge slightly across CPUs - note that the last_tsc_nsec and last_tsc_write variables are used here, and now last_tsc_nsec will be different for each VCPU, reflecting the actual time of the update. However, compute_guest_tsc is used only for guests which already have TSC stability issues, and further, note that the previous patch has caused last_tsc_write to be incremented by the difference in nanoseconds, converted back into guest cycles. As such, only boundary rounding errors should be visible, which, given the resolution in nanoseconds, is going to only be a few cycles and only visible in cross-CPU consistency tests. The problem can be fixed by adding a new set of variables to track the start offset and start write value for the current sync cycle.
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4390f42b371f..030d495e5c78 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1067,7 +1067,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } - ns = kvm->arch.last_tsc_nsec; } kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; -- cgit v1.2.3 From b183aa580a3a09b5d79224a9022418508532c778 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:53 -0200 Subject: KVM: Fix last_guest_tsc / tsc_offset semantics The variable last_guest_tsc was being used as an ad-hoc indicator that guest TSC has been initialized and recorded correctly. However, it may not have been; it could be that guest TSC has been set to some large value, then back to a small value (by, say, a software reboot). This defeats the logic and causes KVM to falsely assume that the guest TSC has gone backwards, marking the host TSC unstable, which is undesirable behavior. In addition, rather than try to compute an offset adjustment for the TSC on unstable platforms, just recompute the whole offset. This allows us to get rid of one callsite for adjust_tsc_offset, which is problematic because the units it takes are in guest units, but here, the computation was originally being done in host units. Doing this, and also recording last_guest_tsc when the TSC is written, allows us to remove the tricky logic which depended on last_guest_tsc being zero to indicate a reset or uninitialized value. Instead, we now have the guarantee that the guest TSC offset is always at least something which will get us last_guest_tsc.
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 030d495e5c78..2a59f76d96f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1079,6 +1079,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.hv_clock.tsc_timestamp = 0; vcpu->arch.last_tsc_write = data; vcpu->arch.last_tsc_nsec = ns; + vcpu->arch.last_guest_tsc = data; } EXPORT_SYMBOL_GPL(kvm_write_tsc); @@ -1147,7 +1148,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * observed by the guest and ensure the new system time is greater. */ max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + if (vcpu->hv_clock.tsc_timestamp) { max_kernel_ns = vcpu->last_guest_tsc - vcpu->hv_clock.tsc_timestamp; max_kernel_ns = pvclock_scale_delta(max_kernel_ns, @@ -2257,13 +2258,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 tsc; tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : - tsc - vcpu->arch.last_guest_tsc; + tsc_delta = tsc - vcpu->arch.last_guest_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + vcpu->arch.last_guest_tsc); + kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); -- cgit v1.2.3 From 6f526ec5383dcd5fa5ffc7b3ac1d62099a0b46ad Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:54 -0200 Subject: KVM: Add last_host_tsc tracking back to KVM The variable last_host_tsc was removed from upstream code. I am adding it back for two reasons. First, it is unnecessary to use guest TSC computation to conclude information about the host TSC. 
The guest may set the TSC backwards (this case is handled by the previous patch), but the computation of guest TSC (and fetching an MSR) is significantly more work and complexity than simply reading the hardware counter. In addition, we don't actually need the guest TSC for any part of the computation; by always recomputing the offset, we can eliminate the need to deal with the current offset and any scaling factors that may apply. The second reason is that later on, we are going to be using the host TSC value to restore TSC offsets after a host S4 suspend, so we need to be reading the host values, not the guest values here. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8a34fca6c572..b23682900f41 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,6 +422,7 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u64 last_host_tsc; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a59f76d96f1..39a57dac884a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2253,13 +2253,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - /* Make sure TSC doesn't go backwards */ - s64 tsc_delta; - u64 tsc; - - tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = tsc - vcpu->arch.last_guest_tsc; - + s64 tsc_delta = !vcpu->arch.last_host_tsc ?
0 : + native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2282,7 +2277,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From f1e2b26003c41e581243c09ceed7567677449468 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2012 15:43:55 -0200 Subject: KVM: Allow adjust_tsc_offset to be in host or guest cycles Redefine the API to take a parameter indicating whether an adjustment is in host or guest cycles. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 ++++++++++++- arch/x86/kvm/svm.c | 6 +++++- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b23682900f41..dd439f13df84 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -646,7 +646,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -676,6 +676,17 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, 
adjustment, true); +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e12026e5244e..0b7690ee20bd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1016,10 +1016,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(adjustment < 0); + if (host) + adjustment = svm_scale_tsc(vcpu, adjustment); + svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e6bf61fa1c03..575fb742a6fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1856,7 +1856,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); vmcs_write64(TSC_OFFSET, offset + adjustment); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39a57dac884a..3b931302fa55 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1116,7 +1116,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { - kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } -- cgit v1.2.3 From 0dd6a6edb0124e6c71931ff575b18e15ed6e8603 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:56 -0200 Subject: KVM: Dont mark TSC unstable due to S4 suspend During a host suspend, TSC may go backwards, which KVM interprets as an unstable TSC. 
Technically, KVM should not be marking the TSC unstable, which causes the TSC clocksource to go bad, but we need to be adjusting the TSC offsets in such a case. Dealing with this issue is a little tricky as the only place we can reliably do it is before much of the timekeeping infrastructure is up and running. On top of this, we are not in a KVM thread context, so we may not be able to safely access VCPU fields. Instead, we compute our best known hardware offset at power-up and stash it to be applied to all VCPUs when they actually start running. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd439f13df84..4fbeb84b1818 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -423,6 +423,7 @@ struct kvm_vcpu_arch { u64 last_tsc_nsec; u64 last_tsc_write; u64 last_host_tsc; + u64 tsc_offset_adjustment; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b931302fa55..4e9bd23d522d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2252,6 +2252,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); + vcpu->arch.tsc_offset_adjustment = 0; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + } + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
0 : native_read_tsc() - vcpu->arch.last_host_tsc; @@ -5964,13 +5972,88 @@ int kvm_arch_hardware_enable(void *garbage) struct kvm *kvm; struct kvm_vcpu *vcpu; int i; + int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(garbage); + if (ret != 0) + return ret; + + local_tsc = native_read_tsc(); + stable = !check_tsc_unstable(); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!stable && vcpu->cpu == smp_processor_id()) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; + } + } + } + + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. 
In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unnreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader. 
+ */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } + return 0; } void kvm_arch_hardware_disable(void *garbage) -- cgit v1.2.3 From e26101b116a6235bcd80b3a4c38c9fe91286cd79 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:57 -0200 Subject: KVM: Track TSC synchronization in generations This allows us to track the original nanosecond and counter values at each phase of TSC writing by the guest. This gets us perfect offset matching for stable TSC systems, and perfect software computed TSC matching for machines with unstable TSC. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 +++++++--- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4fbeb84b1818..c24125cd0c63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,10 +420,11 @@ struct kvm_vcpu_arch { u64 last_guest_tsc; u64 last_kernel_ns; - u64 last_tsc_nsec; - u64 last_tsc_write; u64 last_host_tsc; u64 tsc_offset_adjustment; + u64 this_tsc_nsec; + u64 this_tsc_write; + u8 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -513,9 +514,12 @@ struct kvm_arch { s64 kvmclock_offset; raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; - u64 last_tsc_offset; u64 last_tsc_write; u32 last_tsc_khz; + u64 cur_tsc_nsec; + u64 cur_tsc_write; + u64 cur_tsc_offset; + u8 cur_tsc_generation; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e9bd23d522d..e86f9b22eaca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1013,10 +1013,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { - u64 tsc = 
pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, vcpu->arch.virtual_tsc_mult, vcpu->arch.virtual_tsc_shift); - tsc += vcpu->arch.last_tsc_write; + tsc += vcpu->arch.this_tsc_write; return tsc; } @@ -1059,7 +1059,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) if (nsdiff < NSEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { - offset = kvm->arch.last_tsc_offset; + offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); @@ -1067,20 +1067,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + } else { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computaion in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = data; + kvm->arch.cur_tsc_offset = offset; + pr_debug("kvm: new tsc generation %u, clock %llu\n", + kvm->arch.cur_tsc_generation, data); } + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. 
+ */ kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_offset = offset; kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - kvm_x86_ops->write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_tsc_write = data; - vcpu->arch.last_tsc_nsec = ns; vcpu->arch.last_guest_tsc = data; + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_x86_ops->write_tsc_offset(vcpu, offset); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } + EXPORT_SYMBOL_GPL(kvm_write_tsc); static int kvm_guest_time_update(struct kvm_vcpu *v) -- cgit v1.2.3 From 10166744b80a41c30d82bc6e11140f5b28d257ab Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Tue, 7 Feb 2012 23:19:20 +0530 Subject: KVM: VMX: remove yield_on_hlt yield_on_hlt was introduced for CPU bandwidth capping. Now it is redundant with CFS hardlimit. yield_on_hlt also complicates the scenario in paravirtual environment, that needs to trap halt. for e.g. paravirtualized ticket spinlocks. 
Acked-by: Anthony Liguori Signed-off-by: Raghavendra K T Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 575fb742a6fc..d2bd719925a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly vmm_exclusive = 1; module_param(vmm_exclusive, bool, S_IRUGO); -static bool __read_mostly yield_on_hlt = 1; -module_param(yield_on_hlt, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } -static void vmx_clear_hlt(struct kvm_vcpu *vcpu) -{ - /* Ensure that we clear the HLT state in the VMCS. We don't need to - * explicitly skip the instruction because if the HLT state is set, then - * the instruction is already executing and RIP has already been - * advanced. */ - if (!yield_on_hlt && - vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); -} - /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. 
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmx_clear_hlt(vcpu); } static bool vmx_rdtscp_supported(void) @@ -2405,7 +2390,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; - min = + min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | @@ -2420,9 +2405,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; - if (yield_on_hlt) - min |= CPU_BASED_HLT_EXITING; - opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -4009,7 +3991,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); - vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -4041,7 +4022,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vmx_clear_hlt(vcpu); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 6dbf79e7164e9a86c1e466062c48498142ae6128 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 5 Feb 2012 20:42:41 +0900 Subject: KVM: Fix write protection race during dirty logging This patch fixes a race introduced by: commit 95d4c16ce78cb6b7549a09159c409d52ddd18dae KVM: Optimize dirty logging by rmap_write_protect() During protecting pages for dirty logging, other threads may also try to protect a page in mmu_sync_children() or kvm_mmu_get_page(). 
In such a case, because get_dirty_log releases mmu_lock before flushing TLBs, the following race condition can happen: Thread A (get_dirty_log): lock(mmu_lock), clear pte.w, unlock(mmu_lock). Thread B (another thread): lock(mmu_lock), pte.w is already cleared, unlock(mmu_lock), skip TLB flush, return. ... Thread A: TLB flush. Though thread B assumes the page has already been protected when it returns, the remaining TLB entry will break that assumption. This patch fixes this problem by making get_dirty_log hold the mmu_lock until it flushes the TLBs. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e86f9b22eaca..3df0b7a140b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3065,6 +3065,8 @@ static void write_protect_slot(struct kvm *kvm, unsigned long *dirty_bitmap, unsigned long nr_dirty_pages) { + spin_lock(&kvm->mmu_lock); + /* Not many dirty pages compared to # of shadow pages. */ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { unsigned long gfn_offset; @@ -3072,16 +3074,13 @@ static void write_protect_slot(struct kvm *kvm, for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { unsigned long gfn = memslot->base_gfn + gfn_offset; - spin_lock(&kvm->mmu_lock); kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - spin_unlock(&kvm->mmu_lock); } kvm_flush_remote_tlbs(kvm); - } else { - spin_lock(&kvm->mmu_lock); + } else kvm_mmu_slot_remove_write_access(kvm, memslot->id); - spin_unlock(&kvm->mmu_lock); - } + + spin_unlock(&kvm->mmu_lock); } /* -- cgit v1.2.3 From fb03cb6f44236f4bef62a0dda8e025ff5ca51417 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 12:59:10 +0900 Subject: KVM: Introduce gfn_to_index() which returns the index for a given level This patch cleans up the code and removes the "(void)level;" warning suppressor.
Note that we can also use this for PT_PAGE_TABLE_LEVEL to treat every level uniformly later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 +-- include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 7 +------ 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ae76cc3392e1..37e7f100a0e0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -688,8 +688,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, { unsigned long idx; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); + idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->lpage_info[level - 2][idx]; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9698080c902b..7a08496b974a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -681,6 +681,13 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) return gfn_to_memslot(kvm, gfn)->id; } +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. 
*/ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 470e30520fe8..415fe816fc15 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -784,15 +784,10 @@ int __kvm_set_memory_region(struct kvm *kvm, int lpages; int level = i + 2; - /* Avoid unused variable warning if no large pages */ - (void)level; - if (new.lpage_info[i]) continue; - lpages = 1 + ((base_gfn + npages - 1) - >> KVM_HPAGE_GFN_SHIFT(level)); - lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); + lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1; new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i])); -- cgit v1.2.3 From db3fe4eb45f3555d91a7124e18cf3a2f2a30eb90 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 13:02:18 +0900 Subject: KVM: Introduce kvm_memory_slot::arch and move lpage_info into it Some members of kvm_memory_slot are not used by every architecture. This patch is the first step to make this difference clear by introducing kvm_memory_slot::arch; lpage_info is moved into it. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 3 ++ arch/ia64/kvm/kvm-ia64.c | 10 ++++++ arch/powerpc/include/asm/kvm_host.h | 3 ++ arch/powerpc/kvm/powerpc.c | 10 ++++++ arch/s390/include/asm/kvm_host.h | 3 ++ arch/s390/kvm/kvm-s390.c | 10 ++++++ arch/x86/include/asm/kvm_host.h | 9 +++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 11 +++--- virt/kvm/kvm_main.c | 70 +++++-------------------------------- 11 files changed, 122 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 2689ee54a1c9..e35b3a84a40b 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -459,6 +459,9 @@ struct kvm_sal_data { unsigned long boot_gp; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch { spinlock_t dirty_log_lock; diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 8ca7261e7b3d..d8ddbba6fe7d 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1571,6 +1571,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1843d5d2a3be..52eb9c1f4fe0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -213,6 +213,9 @@ struct revmap_entry { #define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ #define KVMPPC_GOT_PAGE 0x80 +struct kvm_arch_memory_slot { +}; + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV 
unsigned long hpt_virt; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e21d155eea7..00d7e345b3fe 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -281,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e6304268ea28..7343872890a2 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -245,6 +245,9 @@ struct kvm_vm_stat { u32 remote_tlb_flush; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index cf3c0a91d046..17ad69d596fd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -814,6 +814,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c24125cd0c63..74c9edf2bb18 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -483,6 +483,15 @@ struct kvm_vcpu_arch { } osvw; }; +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + +struct kvm_arch_memory_slot { + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; +}; + struct kvm_arch { 
unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 37e7f100a0e0..ff053ca32303 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -689,7 +689,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->lpage_info[level - 2][idx]; + return &slot->arch.lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3df0b7a140b0..ca74c1dadf3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6239,6 +6239,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.ept_identity_pagetable); } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { + vfree(free->arch.lpage_info[i]); + free->arch.lpage_info[i] = NULL; + } + } +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + int lpages; + int level = i + 2; + + lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.lpage_info[i] = + vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + if (!slot->arch.lpage_info[i]) + goto out_free; + + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][0].write_count = 1; + if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][lpages - 1].write_count = 1; + ugfn = slot->userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + 
!kvm_largepages_enabled()) { + unsigned long j; + + for (j = 0; j < lpages; ++j) + slot->arch.lpage_info[i][j].write_count = 1; + } + } + + return 0; + +out_free: + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + vfree(slot->arch.lpage_info[i]); + slot->arch.lpage_info[i] = NULL; + } + return -ENOMEM; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7a08496b974a..355e44555c39 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -171,11 +171,6 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) -struct kvm_lpage_info { - unsigned long rmap_pde; - int write_count; -}; - struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -184,7 +179,7 @@ struct kvm_memory_slot { unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_head; unsigned long nr_dirty_pages; - struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct kvm_arch_memory_slot arch; unsigned long userspace_addr; int user_alloc; int id; @@ -376,6 +371,9 @@ int kvm_set_memory_region(struct kvm *kvm, int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, @@ -385,6 +383,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, int user_alloc); +bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a30447c5eb4a..8340e0e62034 100644 --- 
a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -535,21 +535,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - int i; - if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { - vfree(free->lpage_info[i]); - free->lpage_info[i] = NULL; - } - } + kvm_arch_free_memslot(free, dont); free->npages = 0; free->rmap = NULL; @@ -685,53 +677,6 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) slots->generation++; } -#ifndef CONFIG_S390 -static int create_lpage_info(struct kvm_memory_slot *slot, unsigned long npages) -{ - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - unsigned long ugfn; - int lpages; - int level = i + 2; - - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; - - slot->lpage_info[i] = vzalloc(lpages * sizeof(*slot->lpage_info[i])); - if (!slot->lpage_info[i]) - goto out_free; - - if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][0].write_count = 1; - if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][lpages - 1].write_count = 1; - ugfn = slot->userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !largepages_enabled) { - unsigned long j; - - for (j = 0; j < lpages; ++j) - slot->lpage_info[i][j].write_count = 1; - } - } - - return 0; - -out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->lpage_info[i]); - slot->lpage_info[i] = NULL; - } - return -ENOMEM; -} -#endif /* not defined 
CONFIG_S390 */ - /* * Allocate some memory and give it an address in the guest physical address * space. @@ -819,10 +764,9 @@ int __kvm_set_memory_region(struct kvm *kvm, new.rmap = vzalloc(npages * sizeof(*new.rmap)); if (!new.rmap) goto out_free; - - if (create_lpage_info(&new, npages)) - goto out_free; #endif /* not defined CONFIG_S390 */ + if (kvm_arch_create_memslot(&new, npages)) + goto out_free; } /* Allocate page dirty bitmap if needed */ @@ -880,8 +824,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!npages) { new.rmap = NULL; new.dirty_bitmap = NULL; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) - new.lpage_info[i] = NULL; + memset(&new.arch, 0, sizeof(new.arch)); } update_memslots(slots, &new); @@ -968,6 +911,11 @@ out: return r; } +bool kvm_largepages_enabled(void) +{ + return largepages_enabled; +} + void kvm_disable_largepages(void) { largepages_enabled = false; -- cgit v1.2.3 From 270c6c79f4e15e599f47174ecedad932463af7a2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 16 Feb 2012 14:44:11 +0200 Subject: KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 7aad5446f393..3e48c1d3edcd 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -413,7 +413,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) struct kvm_pmc *counters; u64 ctr; - pmc &= (3u << 30) - 1; + pmc &= ~(3u << 30); if (!fixed && pmc >= pmu->nr_arch_gp_counters) return 1; if (fixed && pmc >= pmu->nr_arch_fixed_counters) -- cgit v1.2.3 From 7f3d35fddd173e52886d03bc34b5b5d6f5bea343 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:38 +0100 Subject: KVM: x86 emulator: Fix task switch privilege checks Currently, all task switches check privileges against the DPL of the TSS. 
This is only correct for jmp/call to a TSS. If a task gate is used, the DPL of this task gate is used for the check instead. Exceptions, external interrupts and iret shouldn't perform any check. [avi: kill kvm-kmod remnants] Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/include/asm/kvm_host.h | 4 +-- arch/x86/kvm/emulate.c | 53 +++++++++++++++++++++++++++++++++----- arch/x86/kvm/svm.c | 5 +++- arch/x86/kvm/vmx.c | 8 +++--- arch/x86/kvm/x86.c | 6 ++--- 6 files changed, 61 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7b9cfc4878af..df437b68f42b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -388,7 +388,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74c9edf2bb18..e216ba066e79 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -768,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu
*vcpu, unsigned long cr3); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 71450aca3b86..fa310a48591c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1152,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } +static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, + u16 index, struct desc_struct *desc) +{ + struct desc_ptr dt; + ulong addr; + + ctxt->ops->get_idt(ctxt, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, index << 3 | 0x2); + + addr = dt.address + index * 8; + return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); +} + static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { @@ -2421,7 +2437,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2443,12 +2459,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, /* FIXME: check that next_tss_desc is tss */ - if (reason != TASK_SWITCH_IRET) { - if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt) > next_tss_desc.dpl) - return emulate_gp(ctxt, 0); + /* + * Check privileges. The three cases are task switch caused by... + * + * 1. jmp/call/int to task gate: Check against DPL of the task gate + * 2. Exception/IRQ/iret: No check is performed + * 3. 
jmp/call to TSS: Check agains DPL of the TSS + */ + if (reason == TASK_SWITCH_GATE) { + if (idt_index != -1) { + /* Software interrupts */ + struct desc_struct task_gate_desc; + int dpl; + + ret = read_interrupt_descriptor(ctxt, idt_index, + &task_gate_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + dpl = task_gate_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, (idt_index << 3) | 0x2); + } + } else if (reason != TASK_SWITCH_IRET) { + int dpl = next_tss_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, tss_selector); } + desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2501,7 +2540,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { int rc; @@ -2509,7 +2548,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; - rc = emulator_do_task_switch(ctxt, tss_selector, reason, + rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0b7690ee20bd..95cdeaf9c718 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2799,7 +2799,10 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + if (int_type != SVM_EXITINTINFO_TYPE_SOFT) + int_vec = -1; + + if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; diff 
--git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2bd719925a6..124a0952a040 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4658,9 +4658,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) bool has_error_code = false; u32 error_code = 0; u16 tss_selector; - int reason, type, idt_v; + int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4698,8 +4699,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { + if (kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code) == EMULATE_FAIL) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca74c1dadf3a..490a1b1a255f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5655,15 +5655,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(ctxt, tss_selector, reason, + ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (ret) -- cgit v1.2.3 From 66b0ab8fac1031ffc70eb77491048339f2717a54 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:39 +0100 
Subject: KVM: x86 emulator: VM86 segments must have DPL 3 Setting the segment DPL to 0 for at least the VM86 code segment makes the VM entry fail on VMX. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fa310a48591c..b19e9fffe582 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1244,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; + if (ctxt->mode == X86EMUL_MODE_VM86) + seg_desc.dpl = 3; goto load; } -- cgit v1.2.3 From ea5e97e8bf1d56a4d9461c39e082b9c31a7be4ff Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:40 +0100 Subject: KVM: SVM: Fix CPL updates Keep CPL at 0 in real mode and at 3 in VM86. In protected/long mode, use RPL rather than DPL of the code segment. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 95cdeaf9c718..ab39d84dee00 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1332,6 +1332,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } +static void svm_update_cpl(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int cpl; + + if (!is_protmode(vcpu)) + cpl = 0; + else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) + cpl = 3; + else + cpl = svm->vmcb->save.cs.selector & 0x3; + + svm->vmcb->save.cpl = cpl; +} + static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1607,9 +1622,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } if (seg == VCPU_SREG_CS) - svm->vmcb->save.cpl - = (svm->vmcb->save.cs.attrib - 
>> SVM_SELECTOR_DPL_SHIFT) & 3; + svm_update_cpl(vcpu); mark_dirty(svm->vmcb, VMCB_SEG); } -- cgit v1.2.3 From 4cee4798a304ee1ea579423ca048f16ceaccdfb5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:41 +0100 Subject: KVM: x86 emulator: Allow PM/VM86 switch during task switch Task switches can switch between Protected Mode and VM86. The current mode must be updated during the task switch emulation so that the new segment selectors are interpreted correctly. In order to let privilege checks succeed, rflags needs to be updated in the vcpu struct as this causes a CPL update. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 20 ++++++++++++++++++++ arch/x86/kvm/svm.c | 4 ++++ arch/x86/kvm/x86.c | 6 ++++++ 4 files changed, 31 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index df437b68f42b..c222e1a1b12a 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -176,6 +176,7 @@ struct x86_emulate_ops { void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b19e9fffe582..83756223f8aa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2344,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; + + /* General purpose registers */ ctxt->regs[VCPU_REGS_RAX] = tss->eax; ctxt->regs[VCPU_REGS_RCX] = tss->ecx; 
ctxt->regs[VCPU_REGS_RDX] = tss->edx; @@ -2365,6 +2367,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); + /* + * If we're switching between Protected Mode and VM86, we need to make + * sure to update the mode before loading the segment descriptors so + * that the selectors are interpreted correctly. + * + * Need to get rflags to the vcpu struct immediately because it + * influences the CPL which is checked at least when loading the segment + * descriptors and when pushing an error code to the new kernel stack. + * + * TODO Introduce a separate ctxt->ops->set_cpl callback + */ + if (ctxt->eflags & X86_EFLAGS_VM) + ctxt->mode = X86EMUL_MODE_VM86; + else + ctxt->mode = X86EMUL_MODE_PROT32; + + ctxt->ops->set_rflags(ctxt, ctxt->eflags); + /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab39d84dee00..53efd597f39e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1354,7 +1354,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; + to_svm(vcpu)->vmcb->save.rflags = rflags; + if ((old_rflags ^ rflags) & X86_EFLAGS_VM) + svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 490a1b1a255f..03a1fd47a6d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4129,6 +4129,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } +static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) +{ + kvm_set_rflags(emul_to_vcpu(ctxt), val); +} + static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return 
kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4310,6 +4315,7 @@ static struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, + .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, -- cgit v1.2.3 From 3e515705a1f46beb1c942bb8043c16f8ac7b1e9e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Mar 2012 14:23:29 +0200 Subject: KVM: Ensure all vcpus are consistent with in-kernel irqchip settings If some vcpus are created before KVM_CREATE_IRQCHIP, then irqchip_in_kernel() and vcpu->arch.apic will be inconsistent, leading to potential NULL pointer dereferences. Fix by: - ensuring that no vcpus are installed when KVM_CREATE_IRQCHIP is called - ensuring that a vcpu has an apic if it is installed after KVM_CREATE_IRQCHIP This is somewhat long winded because vcpu->arch.apic is created without kvm->lock held. Based on earlier patch by Michael Ellerman. Signed-off-by: Michael Ellerman Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 5 +++++ arch/x86/kvm/x86.c | 8 ++++++++ include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 4 ++++ 4 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d8ddbba6fe7d..f5104b7c52cd 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1172,6 +1172,11 @@ out: #define PALE_RESET_ENTRY 0x80000000ffffffb0UL +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kcm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu *v; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03a1fd47a6d3..9477dc6cccae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3199,6 +3199,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EEXIST; if (kvm->arch.vpic) goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + 
goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); if (vpic) { @@ -6107,6 +6110,11 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 355e44555c39..e42d85ae8541 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -805,6 +805,13 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } + +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); + +#else + +static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } + #endif #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e4431ada5947..94e148e38719 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1651,6 +1651,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_destroy; mutex_lock(&kvm->lock); + if (!kvm_vcpu_compatible(vcpu)) { + r = -EINVAL; + goto unlock_vcpu_destroy; + } if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { r = -EINVAL; goto unlock_vcpu_destroy; -- cgit v1.2.3 From 07700a94b00a4fcbbfb07d1b72dc112a0e036735 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 28 Feb 2012 14:19:54 +0100 Subject: KVM: Allow host IRQ sharing for assigned PCI 2.3 devices PCI 2.3 allows to generically disable IRQ sources at device level. This enables us to share legacy IRQs of such devices with other host devices when passing them to a guest. The new IRQ sharing feature introduced here is optional, user space has to request it explicitly. 
Moreover, user space can inform us about its view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the interrupt and signaling it if the guest masked it via the virtualized PCI config space. Signed-off-by: Jan Kiszka Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 41 ++++++++ arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 6 ++ include/linux/kvm_host.h | 2 + virt/kvm/assigned-dev.c | 209 ++++++++++++++++++++++++++++++++------ 5 files changed, 230 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 59a38264a0ed..6386f8c0482e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1169,6 +1169,14 @@ following flags are specified: /* Depends on KVM_CAP_IOMMU */ #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +/* The following two depend on KVM_CAP_PCI_2_3 */ +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) + +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure isolation of the device. Usages not specifying this flag are deprecated. @@ -1441,6 +1449,39 @@ The "num_dirty" field is a performance hint for KVM to determine whether it should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. +4.60 KVM_ASSIGN_SET_INTX_MASK + +Capability: KVM_CAP_PCI_2_3 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_assigned_pci_dev (in) +Returns: 0 on success, -1 on error + +Allows userspace to mask PCI INTx interrupts from the assigned device. 
The +kernel will not deliver INTx interrupts to the guest between setting and +clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of +and emulation of PCI 2.3 INTx disable command register behavior. + +This may be used for both PCI 2.3 devices supporting INTx disable natively and +older devices lacking this support. Userspace is responsible for emulating the +read value of the INTx disable bit in the guest visible PCI command register. +When modifying the INTx disable state, userspace should precede updating the +physical device command register by calling this ioctl to inform the kernel of +the new intended INTx mask state. + +Note that the kernel uses the device INTx disable bit to internally manage the +device interrupt state for PCI 2.3 devices. Reads of this register may +therefore not match the expected value. Writes should always use the guest +intended INTx disable value rather than attempting to read-copy-update the +current physical device state. Races between user and kernel updates to the +INTx disable bit are handled lazily in the kernel. It's possible the device +may generate unintended interrupts, but they will not be injected into the +guest. + +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is +evaluated. 
+ 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9477dc6cccae..6866083a48c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2143,6 +2143,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: + case KVM_CAP_PCI_2_3: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index acbe42939089..6c322a90b92f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -588,6 +588,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 #define KVM_CAP_SYNC_REGS 74 +#define KVM_CAP_PCI_2_3 75 #ifdef KVM_CAP_IRQ_ROUTING @@ -784,6 +785,9 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_TSC_CONTROL */ #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) +/* Available with KVM_CAP_PCI_2_3 */ +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ + struct kvm_assigned_pci_dev) /* * ioctls for vcpu fds @@ -857,6 +861,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) struct kvm_assigned_pci_dev { __u32 assigned_dev_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e42d85ae8541..ec171c1d0878 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel { unsigned int entries_nr; int host_irq; bool host_irq_disabled; + bool pci_2_3; struct msix_entry *host_msix_entries; int guest_irq; struct msix_entry *guest_msix_entries; @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; spinlock_t intx_lock; + struct mutex intx_mask_lock; char irq_name[32]; struct pci_saved_state *pci_saved_state; }; diff --git 
a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ece80612b594..08e05715df72 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -55,22 +55,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret; - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { - spin_lock(&assigned_dev->intx_lock); + spin_lock(&assigned_dev->intx_lock); + if (pci_check_and_mask_intx(assigned_dev->dev)) { + assigned_dev->host_irq_disabled = true; + ret = IRQ_WAKE_THREAD; + } else + ret = IRQ_NONE; + spin_unlock(&assigned_dev->intx_lock); + + return ret; +} + +static void +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, + int vector) +{ + if (unlikely(assigned_dev->irq_requested_type & + KVM_DEV_IRQ_GUEST_INTX)) { + mutex_lock(&assigned_dev->intx_mask_lock); + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, vector, 1); + mutex_unlock(&assigned_dev->intx_mask_lock); + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + vector, 1); +} + +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); disable_irq_nosync(irq); assigned_dev->host_irq_disabled = true; - spin_unlock(&assigned_dev->intx_lock); + spin_unlock_irq(&assigned_dev->intx_lock); } - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); + + return IRQ_HANDLED; +} + +#ifdef __KVM_HAVE_MSI +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) +{ + 
struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); return IRQ_HANDLED; } +#endif #ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) @@ -81,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) if (index >= 0) { vector = assigned_dev->guest_msix_entries[index].vector; - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1); + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); } return IRQ_HANDLED; @@ -98,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); - /* The guest irq may be shared so this ack may be - * from another device. - */ - spin_lock(&dev->intx_lock); - if (dev->host_irq_disabled) { - enable_irq(dev->host_irq); - dev->host_irq_disabled = false; + mutex_lock(&dev->intx_mask_lock); + + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { + bool reassert = false; + + spin_lock_irq(&dev->intx_lock); + /* + * The guest IRQ may be shared so this ack can come from an + * IRQ for another guest device. 
+ */ + if (dev->host_irq_disabled) { + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) + enable_irq(dev->host_irq); + else if (!pci_check_and_unmask_intx(dev->dev)) + reassert = true; + dev->host_irq_disabled = reassert; + } + spin_unlock_irq(&dev->intx_lock); + + if (reassert) + kvm_set_irq(dev->kvm, dev->irq_source_id, + dev->guest_irq, 1); } - spin_unlock(&dev->intx_lock); + + mutex_unlock(&dev->intx_mask_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -154,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq(assigned_dev->host_irq); + if ((assigned_dev->irq_requested_type & + KVM_DEV_IRQ_HOST_INTX) && + (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); + pci_intx(assigned_dev->dev, false); + spin_unlock_irq(&assigned_dev->intx_lock); + synchronize_irq(assigned_dev->host_irq); + } else + disable_irq(assigned_dev->host_irq); free_irq(assigned_dev->host_irq, assigned_dev); @@ -235,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) static int assigned_device_enable_host_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { + irq_handler_t irq_handler; + unsigned long flags; + dev->host_irq = dev->dev->irq; - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. + + /* + * We can only share the IRQ line with other host devices if we are + * able to disable the IRQ source at device-level - independently of + * the guest driver. Otherwise host devices may suffer from unbounded + * IRQ latencies when the guest keeps the line asserted. 
*/ - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, dev->irq_name, dev)) + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + irq_handler = kvm_assigned_dev_intx; + flags = IRQF_SHARED; + } else { + irq_handler = NULL; + flags = IRQF_ONESHOT; + } + if (request_threaded_irq(dev->host_irq, irq_handler, + kvm_assigned_dev_thread_intx, flags, + dev->irq_name, dev)) return -EIO; + + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + spin_lock_irq(&dev->intx_lock); + pci_intx(dev->dev, true); + spin_unlock_irq(&dev->intx_lock); + } return 0; } @@ -260,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, dev->irq_name, dev)) { + if (request_threaded_irq(dev->host_irq, NULL, + kvm_assigned_dev_thread_msi, 0, + dev->irq_name, dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -319,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; - dev->host_irq_disabled = false; return 0; } #endif @@ -331,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; - dev->host_irq_disabled = false; return 0; } #endif @@ -365,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm, default: r = -EINVAL; } + dev->host_irq_disabled = false; if (!r) dev->irq_requested_type |= host_irq_type; @@ -466,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, { int r = -ENODEV; struct kvm_assigned_dev_kernel *match; + unsigned long irq_type; mutex_lock(&kvm->lock); @@ -474,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, if (!match) goto out; - r = kvm_deassign_irq(kvm, match, assigned_irq->flags); + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | + KVM_DEV_IRQ_GUEST_MASK); + r = kvm_deassign_irq(kvm, match, irq_type); out: 
mutex_unlock(&kvm->lock); return r; @@ -607,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, if (!match->pci_saved_state) printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", __func__, dev_name(&dev->dev)); + + if (!pci_intx_mask_supported(dev)) + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; + match->assigned_dev_id = assigned_dev->assigned_dev_id; match->host_segnr = assigned_dev->segnr; match->host_busnr = assigned_dev->busnr; @@ -614,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->flags = assigned_dev->flags; match->dev = dev; spin_lock_init(&match->intx_lock); + mutex_init(&match->intx_mask_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; @@ -759,6 +853,55 @@ msix_entry_out: } #endif +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + r = -ENODEV; + goto out; + } + + mutex_lock(&match->intx_mask_lock); + + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; + + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { + kvm_set_irq(match->kvm, match->irq_source_id, + match->guest_irq, 0); + /* + * Masking at hardware-level is performed on demand, + * i.e. when an IRQ actually arrives at the host. + */ + } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + /* + * Unmask the IRQ line if required. Unmasking at + * device level will be performed by user space. 
+ */ + spin_lock_irq(&match->intx_lock); + if (match->host_irq_disabled) { + enable_irq(match->host_irq); + match->host_irq_disabled = false; + } + spin_unlock_irq(&match->intx_lock); + } + } + + mutex_unlock(&match->intx_mask_lock); + +out: + mutex_unlock(&kvm->lock); + return r; +} + long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, unsigned long arg) { @@ -866,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, break; } #endif + case KVM_ASSIGN_SET_INTX_MASK: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); + break; + } default: r = -ENOTTY; break; @@ -873,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, out: return r; } - -- cgit v1.2.3 From 9ee73970c03edb68146ceb1ba2a7033c99a5e017 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Mar 2012 14:16:33 +0200 Subject: KVM: VMX: Fix delayed load of shared MSRs Shared MSRs (MSR_*STAR and related) are stored in both vmx->guest_msrs and in the CPU registers, but vmx_set_msr() only updated memory. Prior to 46199f33c2953, this didn't matter, since we called vmx_load_host_state(), which scheduled a vmx_save_host_state(), which re-synchronized the CPU state, but now we don't, so the CPU state will not be synchronized until the next exit to host userspace. This mostly affects nested vmx workloads, which play with these MSRs a lot. Fix by loading the MSR eagerly. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 124a0952a040..4a722a0b8e13 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2210,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) + kvm_set_shared_msr(msr->index, msr->data, + msr->mask); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); -- cgit v1.2.3 From a7b9d2ccc3d86303ee9314612d301966e04011c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 26 Feb 2012 16:55:40 +0200 Subject: KVM: PMU: warn when pin control is set in eventsel msr Print warning once if pin control bit is set in eventsel msr since emulation does not support it yet. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/perf_event.h | 1 + arch/x86/kvm/pmu.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..f1f71823f682 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -23,6 +23,7 @@ #define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16) #define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) #define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18) +#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19) #define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20) #define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21) #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3e48c1d3edcd..6af9a542e541 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -210,6 +210,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + 
printk_once("kvm pmu: pin control bit is ignored\n"); + pmc->eventsel = eventsel; stop_counter(pmc); -- cgit v1.2.3 From fac3368310765ade6bbdf07c9acdb04210e8b5b0 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 26 Feb 2012 16:55:41 +0200 Subject: KVM: PMU: Fix raw event check If eventsel has EDGE, INV or CMASK set we should create raw counter for it, but the check is done on a wrong variable. Fix it. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 6af9a542e541..b52a8ed283b2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -223,7 +223,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | + if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | ARCH_PERFMON_EVENTSEL_CMASK))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, -- cgit v1.2.3 From 62079d8a431287a4da81db64e002c71f0e06ca83 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 26 Feb 2012 16:55:42 +0200 Subject: KVM: PMU: add proper support for fixed counter 2 Currently pmu emulation emulates fixed counter 2 as bus cycles architectural counter, but since commit 9c1497ea591b25d perf has pseudo encoding for it. Use it. 
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b52a8ed283b2..a73f0c104813 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and arch_events array */ -int fixed_pmc_events[] = {1, 0, 2}; +int fixed_pmc_events[] = {1, 0, 7}; static bool pmc_is_gp(struct kvm_pmc *pmc) { -- cgit v1.2.3 From 4d6931c380a976753f7566a96b58690010ef1413 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 5 Mar 2012 16:53:06 +0100 Subject: KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask The reset_rsvds_bits_mask() function can use the guest walker's root level number instead of using a separate 'level' variable. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff053ca32303..4cb164268846 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3185,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, #undef PTTYPE static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) + struct kvm_mmu *context) { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); - switch (level) { + switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; @@ -3251,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { context->nx = is_nx(vcpu); + context->root_level = level; - reset_rsvds_bits_mask(vcpu, context, level); + reset_rsvds_bits_mask(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3262,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; context->free = paging_free; - context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; context->direct_map = false; @@ -3279,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { context->nx = false; + context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3289,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = 
paging32_update_pte; - context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; @@ -3327,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->root_level = 0; } else if (is_long_mode(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else { context->nx = false; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging32_gva_to_gpa; } return 0; @@ -3402,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); g_context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); g_context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { g_context->nx = false; - reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); g_context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } -- cgit v1.2.3 From 
a223c313cb13e9ab71051fc5b70610a2829a4082 Mon Sep 17 00:00:00 2001 From: Nicolae Mogoreanu Date: Tue, 21 Feb 2012 13:44:21 -0800 Subject: KVM: Ignore the writes to MSR_K7_HWCR(3) When CPUID Fn8000_0001_EAX reports 0x00100f22 Windows 7 x64 guest tries to set bit 3 in MSRC001_0015 in nt!KiDisableCacheErrataSource and fails. This patch will ignore this step and allow things to move on without having to fake CPUID value. Signed-off-by: Nicolae Mogoreanu Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6866083a48c1..32096cf6c6c9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1547,6 +1547,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ + data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); -- cgit v1.2.3 From 9587190107d0c0cbaccbf7bf6b0245d29095a9ae Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Tue, 6 Mar 2012 16:39:22 +0200 Subject: KVM: nVMX: Fix erroneous exception bitmap check The code which checks whether to inject a pagefault to L1 or L2 (in nested VMX) was wrong, incorrect in how it checked the PF_VECTOR bit. Thanks to Dan Carpenter for spotting this. Signed-off-by: Nadav Har'El Reported-by: Dan Carpenter Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4a722a0b8e13..2c22fc788da2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1664,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. 
*/ - if (!(vmcs12->exception_bitmap & PF_VECTOR)) + if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) return 0; nested_vmx_vmexit(vcpu); -- cgit v1.2.3 From bfcfaa77bdf0f775263e906015982a608df01c76 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 11:16:17 -0800 Subject: vfs: use 'unsigned long' accesses for dcache name comparison and hashing Ok, this is hacky, and only works on little-endian machines with good unaligned handling. And even then only with CONFIG_DEBUG_PAGEALLOC disabled, since it can access up to 7 bytes after the pathname. But it runs like a bat out of hell. Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + fs/Kconfig | 4 ++ fs/dcache.c | 23 +++++++++++ fs/namei.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..09675d3e0ac3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,7 @@ config X86 select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP + select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/fs/Kconfig b/fs/Kconfig index d621f02a3f9e..aa195265362f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -4,6 +4,10 @@ menu "File systems" +# Use unaligned word dcache accesses +config DCACHE_WORD_ACCESS + bool + if BLOCK source "fs/ext2/Kconfig" diff --git a/fs/dcache.c b/fs/dcache.c index bcbdb33fcc20..ffd47a16d870 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -144,6 +144,28 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, static inline int dentry_cmp(const unsigned char *cs, size_t scount, const unsigned char *ct, size_t tcount) { +#ifdef CONFIG_DCACHE_WORD_ACCESS + unsigned long a,b,mask; + + if (unlikely(scount != tcount)) + return 1; + + for (;;) { + a = *(unsigned long *)cs; + b = *(unsigned long *)ct; + if (tcount < sizeof(unsigned long)) + break; + if
(unlikely(a != b)) + return 1; + cs += sizeof(unsigned long); + ct += sizeof(unsigned long); + tcount -= sizeof(unsigned long); + if (!tcount) + return 0; + } + mask = ~(~0ul << tcount*8); + return unlikely(!!((a ^ b) & mask)); +#else if (scount != tcount) return 1; @@ -155,6 +177,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount, tcount--; } while (tcount); return 0; +#endif } static void __d_free(struct rcu_head *head) diff --git a/fs/namei.c b/fs/namei.c index e2ba62820a0f..378497a744b4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1374,6 +1374,126 @@ static inline int can_lookup(struct inode *inode) return 1; } +/* + * We can do the critical dentry name comparison and hashing + * operations one word at a time, but we are limited to: + * + * - Architectures with fast unaligned word accesses. We could + * do a "get_unaligned()" if this helps and is sufficiently + * fast. + * + * - Little-endian machines (so that we can generate the mask + * of low bytes efficiently). Again, we *could* do a byte + * swapping load on big-endian architectures if that is not + * expensive enough to make the optimization worthless. + * + * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we + * do not trap on the (extremely unlikely) case of a page + * crossing operation. + * + * - Furthermore, we need an efficient 64-bit compile for the + * 64-bit case in order to generate the "number of bytes in + * the final mask". Again, that could be replaced with a + * efficient population count instruction or similar. + */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. 
+ */ +static inline long count_masked_bytes(unsigned long mask) +{ + return mask*0x0001020304050608 >> 56; +} + +static inline unsigned int fold_hash(unsigned long hash) +{ + hash += hash >> (8*sizeof(int)); + return hash; +} + +#else /* 32-bit case */ + +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; +} + +#define fold_hash(x) (x) + +#endif + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long a, mask; + unsigned long hash = 0; + + for (;;) { + a = *(unsigned long *)name; + hash *= 9; + if (len < sizeof(unsigned long)) + break; + hash += a; + name += sizeof(unsigned long); + len -= sizeof(unsigned long); + if (!len) + goto done; + } + mask = ~(~0ul << len*8); + hash += mask & a; +done: + return fold_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +#define ONEBYTES 0x0101010101010101ul +#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful +#define HIGHBITS 0x8080808080808080ul + +/* Return the high bit set in the first byte that is a zero */ +static inline unsigned long has_zero(unsigned long a) +{ + return ((a - ONEBYTES) & ~a) & HIGHBITS; +} + +/* + * Calculate the length and hash of the path component, and + * return the length of the component; + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ + unsigned long a, mask, hash, len; + + hash = a = 0; + len = -sizeof(unsigned long); + do { + hash = (hash + a) * 9; + len += sizeof(unsigned long); + a = *(unsigned long *)(name+len); + /* Do we have any NUL or '/' bytes in this word? 
*/ + mask = has_zero(a) | has_zero(a ^ SLASHBYTES); + } while (!mask); + + /* The mask *below* the first high bit set */ + mask = (mask - 1) & ~mask; + mask >>= 7; + hash += a & mask; + *hashp = fold_hash(hash); + + return len + count_masked_bytes(mask); +} + +#else + unsigned int full_name_hash(const unsigned char *name, unsigned int len) { unsigned long hash = init_name_hash(); @@ -1402,6 +1522,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) return len; } +#endif + /* * Name resolution. * This is the basic name resolution function, turning a pathname into -- cgit v1.2.3 From a7f4255f906f60f72e00aad2fb000939449ff32e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Mar 2012 20:55:10 +0100 Subject: x86: Derandom delay_tsc for 64 bit Commit f0fbf0abc093 ("x86: integrate delay functions") converted delay_tsc() into a random delay generator for 64 bit. The reason is that it merged the mostly identical versions of delay_32.c and delay_64.c. Though the subtle difference of the result was: static void delay_tsc(unsigned long loops) { - unsigned bclock, now; + unsigned long bclock, now; Now the function uses rdtscl() which returns the lower 32bit of the TSC. On 32bit that's not problematic as unsigned long is 32bit. On 64 bit this fails when the lower 32bit are close to wrap around when bclock is read, because the following check if ((now - bclock) >= loops) break; evaluated to true on 64bit for e.g. bclock = 0xffffffff and now = 0 because the unsigned long (now - bclock) of these values results in 0xffffffff00000001 which is definitely larger than the loops value. That explains Tvrtko's observation: "Because I am seeing udelay(500) (_occasionally_) being short, and that by delaying for some duration between 0us (yep) and 491us." Make those variables explicitly u32 again, so this works for both 32 and 64 bit. 
Reported-by: Tvrtko Ursulin Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org # >= 2.6.27 Signed-off-by: Linus Torvalds --- arch/x86/lib/delay.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index fc45ba887d05..e395693abdb1 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -48,9 +48,9 @@ static void delay_loop(unsigned long loops) } /* TSC based delay: */ -static void delay_tsc(unsigned long loops) +static void delay_tsc(unsigned long __loops) { - unsigned long bclock, now; + u32 bclock, now, loops = __loops; int cpu; preempt_disable(); -- cgit v1.2.3 From c94082656dac74257f63e91f78d5d458ac781fa5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 9 Mar 2012 16:07:10 -0800 Subject: x86: Use enum instead of literals for trap values The traps are referred to by their numbers and it can be difficult to understand them while reading the code without context. This patch adds enumeration of the trap numbers and replaces the numbers with the correct enum for x86. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20120310000710.GA32667@www.outflux.net Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/traps.h | 25 +++++++++ arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/traps.c | 123 +++++++++++++++++++++++-------------------- 3 files changed, 91 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0012d0902c5f..88eae2aec619 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -89,4 +89,29 @@ asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif +/* Interrupts/Exceptions */ +enum { + X86_TRAP_DE = 0, /* 0, Divide-by-zero */ + X86_TRAP_DB, /* 1, Debug */ + X86_TRAP_NMI, /* 2, Non-maskable Interrupt */ + X86_TRAP_BP, /* 3, Breakpoint */ + X86_TRAP_OF, /* 4, Overflow */ + X86_TRAP_BR, /* 5, Bound Range Exceeded */ + X86_TRAP_UD, /* 6, Invalid Opcode */ + X86_TRAP_NM, /* 7, Device Not Available */ + X86_TRAP_DF, /* 8, Double Fault */ + X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */ + X86_TRAP_TS, /* 10, Invalid TSS */ + X86_TRAP_NP, /* 11, Segment Not Present */ + X86_TRAP_SS, /* 12, Stack Segment Fault */ + X86_TRAP_GP, /* 13, General Protection Fault */ + X86_TRAP_PF, /* 14, Page Fault */ + X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */ + X86_TRAP_MF, /* 16, x87 Floating-Point Exception */ + X86_TRAP_AC, /* 17, Alignment Check */ + X86_TRAP_MC, /* 18, Machine Check */ + X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */ + X86_TRAP_IRET = 32, /* 32, IRET Exception */ +}; + #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbce..7b77062dea11 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -61,7 +61,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0, 0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error(get_irq_regs(), 0, 16); + math_error(get_irq_regs(), 0, X86_TRAP_MF); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/traps.c 
b/arch/x86/kernel/traps.c index 4bbe04d96744..037fc2bc5316 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -119,7 +119,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ - if (trapnr < 6) + if (trapnr < X86_TRAP_UD) goto vm86_trap; goto trap_signal; } @@ -203,27 +203,31 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, + regs->ip) +DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, + regs->ip) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", + coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) #ifdef CONFIG_X86_32 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) #endif -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, + BUS_ADRALN, 0) #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { if (notify_die(DIE_TRAP, "stack segment", regs, 
error_code, - 12, SIGBUS) == NOTIFY_STOP) + X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) return; preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); preempt_conditional_cli(regs); } @@ -233,10 +237,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) struct task_struct *tsk = current; /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; + tsk->thread.trap_no = X86_TRAP_DF; /* * This is always a kernel trap and never fixable (and thus must @@ -264,7 +268,7 @@ do_general_protection(struct pt_regs *regs, long error_code) goto gp_in_kernel; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; + tsk->thread.trap_no = X86_TRAP_GP; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { @@ -291,9 +295,9 @@ gp_in_kernel: return; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) + tsk->thread.trap_no = X86_TRAP_GP; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) return; die("general protection fault", regs, error_code); } @@ -302,13 +306,13 @@ gp_in_kernel: dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) + if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) return; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) + if (notify_die(DIE_INT3, "int3", regs, 
error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) return; /* @@ -317,7 +321,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) */ debug_stack_usage_inc(); preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); debug_stack_usage_dec(); } @@ -422,8 +426,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_sti(regs); if (regs->flags & X86_VM_MASK) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, 1); + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, + X86_TRAP_DB); preempt_conditional_cli(regs); debug_stack_usage_dec(); return; @@ -460,7 +464,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) struct task_struct *task = current; siginfo_t info; unsigned short err; - char *str = (trapnr == 16) ? "fpu exception" : "simd exception"; + char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : + "simd exception"; if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; @@ -485,7 +490,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) info.si_signo = SIGFPE; info.si_errno = 0; info.si_addr = (void __user *)regs->ip; - if (trapnr == 16) { + if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -529,10 +534,11 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) info.si_code = FPE_FLTRES; } else { /* - * If we're using IRQ 13, or supposedly even some trap 16 - * implementations, it's possible we get a spurious trap... + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. 
*/ - return; /* Spurious trap, no error */ + return; } force_sig_info(SIGFPE, &info, task); } @@ -543,13 +549,13 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) ignore_fpu_irq = 1; #endif - math_error(regs, error_code, 16); + math_error(regs, error_code, X86_TRAP_MF); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - math_error(regs, error_code, 19); + math_error(regs, error_code, X86_TRAP_XF); } dotraplinkage void @@ -643,20 +649,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) info.si_errno = 0; info.si_code = ILL_BADSTK; info.si_addr = NULL; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) + if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + &info); } #endif /* Set of traps needed for early debugging. 
*/ void __init early_trap_init(void) { - set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); - set_intr_gate(14, &page_fault); + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + set_intr_gate(X86_TRAP_PF, &page_fault); load_idt(&idt_descr); } @@ -672,30 +679,30 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_intr_gate(0, &divide_error); - set_intr_gate_ist(2, &nmi, NMI_STACK); + set_intr_gate(X86_TRAP_DE, &divide_error); + set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); /* int4 can be called from all */ - set_system_intr_gate(4, &overflow); - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); + set_system_intr_gate(X86_TRAP_OF, &overflow); + set_intr_gate(X86_TRAP_BR, &bounds); + set_intr_gate(X86_TRAP_UD, &invalid_op); + set_intr_gate(X86_TRAP_NM, &device_not_available); #ifdef CONFIG_X86_32 - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); + set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); #else - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); + set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); #endif - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); + set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); + set_intr_gate(X86_TRAP_TS, &invalid_TSS); + set_intr_gate(X86_TRAP_NP, &segment_not_present); + set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_GP, &general_protection); + set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); + set_intr_gate(X86_TRAP_MF, &coprocessor_error); + 
set_intr_gate(X86_TRAP_AC, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); + set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); #endif - set_intr_gate(19, &simd_coprocessor_error); + set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) @@ -720,7 +727,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_64 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(1, &debug); - set_nmi_gate(3, &int3); + set_nmi_gate(X86_TRAP_DB, &debug); + set_nmi_gate(X86_TRAP_BP, &int3); #endif } -- cgit v1.2.3 From 026abc333205c1fff80138b8c2cac3d0347685f4 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 8 Mar 2012 16:02:20 +0000 Subject: gma500: initial medfield merge We need to merge this ahead of some of the cleanup because a lot of needed cleanup spans both new and old chips. If we try and clean up and the merge we end up fighting ourselves. Signed-off-by: Kirill A. 
Shutemov [With a load of the cleanup stuff folded in, register stuff reworked sanely] Signed-off-by: Alan Cox Signed-off-by: Dave Airlie --- arch/x86/platform/mrst/mrst.c | 16 + drivers/gpu/drm/gma500/Kconfig | 7 + drivers/gpu/drm/gma500/Makefile | 10 + drivers/gpu/drm/gma500/mdfld_device.c | 691 ++++++++++++++ drivers/gpu/drm/gma500/mdfld_dsi_dpi.c | 1024 +++++++++++++++++++++ drivers/gpu/drm/gma500/mdfld_dsi_dpi.h | 79 ++ drivers/gpu/drm/gma500/mdfld_dsi_output.c | 635 +++++++++++++ drivers/gpu/drm/gma500/mdfld_dsi_output.h | 389 ++++++++ drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c | 694 ++++++++++++++ drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h | 92 ++ drivers/gpu/drm/gma500/mdfld_intel_display.c | 1192 +++++++++++++++++++++++++ drivers/gpu/drm/gma500/mdfld_output.c | 77 ++ drivers/gpu/drm/gma500/mdfld_output.h | 77 ++ drivers/gpu/drm/gma500/mdfld_tmd_vid.c | 201 +++++ drivers/gpu/drm/gma500/mdfld_tpo_vid.c | 124 +++ drivers/gpu/drm/gma500/psb_drv.c | 10 + drivers/gpu/drm/gma500/psb_drv.h | 89 ++ drivers/gpu/drm/gma500/psb_irq.c | 58 ++ drivers/gpu/drm/gma500/psb_irq.h | 2 + drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c | 829 +++++++++++++++++ drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h | 38 + include/linux/i2c/tc35876x.h | 11 + 22 files changed, 6345 insertions(+) create mode 100644 drivers/gpu/drm/gma500/mdfld_device.c create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_dpi.c create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_dpi.h create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_output.c create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_output.h create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c create mode 100644 drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h create mode 100644 drivers/gpu/drm/gma500/mdfld_intel_display.c create mode 100644 drivers/gpu/drm/gma500/mdfld_output.c create mode 100644 drivers/gpu/drm/gma500/mdfld_output.h create mode 100644 drivers/gpu/drm/gma500/mdfld_tmd_vid.c create mode 100644 
drivers/gpu/drm/gma500/mdfld_tpo_vid.c create mode 100644 drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c create mode 100644 drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h create mode 100644 include/linux/i2c/tc35876x.h (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 475e2cd0f3c3..b930cc43a235 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -686,6 +688,19 @@ static void *msic_ocd_platform_data(void *info) return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); } +/* tc35876x DSI-LVDS bridge chip and panel platform data */ +static void *tc35876x_platform_data(void *data) +{ + static struct tc35876x_platform_data pdata; + + /* gpio pins set to -1 will not be used by the driver */ + pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN"); + pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN"); + pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3"); + + return &pdata; +} + static const struct devs_id __initconst device_ids[] = { {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, @@ -698,6 +713,7 @@ static const struct devs_id __initconst device_ids[] = { {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, {"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data}, + {"i2c_disp_brig", SFI_DEV_TYPE_I2C, 0, &tc35876x_platform_data}, /* MSIC subdevices */ {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data}, diff --git a/drivers/gpu/drm/gma500/Kconfig b/drivers/gpu/drm/gma500/Kconfig index f92a7f4208d1..42e665c7e90a 100644 --- a/drivers/gpu/drm/gma500/Kconfig +++ b/drivers/gpu/drm/gma500/Kconfig @@ -24,3 +24,10 @@ config DRM_GMA3600 help Say yes to include basic support for Intel GMA3600/3650 (Intel Cedar Trail) platforms. 
+ +config DRM_MEDFIELD + bool "Intel Medfield support (Experimental)" + depends on DRM_GMA500 && X86_INTEL_MID + help + Say yes to include support for the Intel Medfield platform. + diff --git a/drivers/gpu/drm/gma500/Makefile b/drivers/gpu/drm/gma500/Makefile index 81c103be5e21..1583982917ce 100644 --- a/drivers/gpu/drm/gma500/Makefile +++ b/drivers/gpu/drm/gma500/Makefile @@ -37,4 +37,14 @@ gma500_gfx-$(CONFIG_DRM_GMA600) += oaktrail_device.o \ oaktrail_hdmi.o \ oaktrail_hdmi_i2c.o +gma500_gfx-$(CONFIG_DRM_MEDFIELD) += mdfld_device.o \ + mdfld_output.o \ + mdfld_intel_display.o \ + mdfld_dsi_output.o \ + mdfld_dsi_dpi.o \ + mdfld_dsi_pkg_sender.o \ + mdfld_tpo_vid.o \ + mdfld_tmd_vid.o \ + tc35876x-dsi-lvds.o + obj-$(CONFIG_DRM_GMA500) += gma500_gfx.o diff --git a/drivers/gpu/drm/gma500/mdfld_device.c b/drivers/gpu/drm/gma500/mdfld_device.c new file mode 100644 index 000000000000..6cfdda90eef1 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_device.c @@ -0,0 +1,691 @@ +/************************************************************************** + * Copyright (c) 2011, Intel Corporation. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + **************************************************************************/ + +#include "psb_drv.h" +#include "mid_bios.h" +#include "mdfld_output.h" +#include "mdfld_dsi_output.h" +#include "tc35876x-dsi-lvds.h" + +#include + +#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE + +#define MRST_BLC_MAX_PWM_REG_FREQ 0xFFFF +#define BLC_PWM_PRECISION_FACTOR 100 /* 10000000 */ +#define BLC_PWM_FREQ_CALC_CONSTANT 32 +#define MHz 1000000 +#define BRIGHTNESS_MIN_LEVEL 1 +#define BRIGHTNESS_MAX_LEVEL 100 +#define BRIGHTNESS_MASK 0xFF +#define BLC_POLARITY_NORMAL 0 +#define BLC_POLARITY_INVERSE 1 +#define BLC_ADJUSTMENT_MAX 100 + +#define MDFLD_BLC_PWM_PRECISION_FACTOR 10 +#define MDFLD_BLC_MAX_PWM_REG_FREQ 0xFFFE +#define MDFLD_BLC_MIN_PWM_REG_FREQ 0x2 + +#define MDFLD_BACKLIGHT_PWM_POLARITY_BIT_CLEAR (0xFFFE) +#define MDFLD_BACKLIGHT_PWM_CTL_SHIFT (16) + +static struct backlight_device *mdfld_backlight_device; + +int mdfld_set_brightness(struct backlight_device *bd) +{ + struct drm_device *dev = + (struct drm_device *)bl_get_data(mdfld_backlight_device); + struct drm_psb_private *dev_priv = dev->dev_private; + int level = bd->props.brightness; + + DRM_DEBUG_DRIVER("backlight level set to %d\n", level); + + /* Perform value bounds checking */ + if (level < BRIGHTNESS_MIN_LEVEL) + level = BRIGHTNESS_MIN_LEVEL; + + if (gma_power_begin(dev, false)) { + u32 adjusted_level = 0; + + /* + * Adjust the backlight level with the percent in + * dev_priv->blc_adj2 + */ + adjusted_level = level * dev_priv->blc_adj2; + adjusted_level = adjusted_level / BLC_ADJUSTMENT_MAX; + dev_priv->brightness_adjusted = adjusted_level; + + if (mdfld_get_panel_type(dev, 0) == TC35876X) { + if (dev_priv->dpi_panel_on[0] || + dev_priv->dpi_panel_on[2]) + tc35876x_brightness_control(dev, + dev_priv->brightness_adjusted); + } else { + if (dev_priv->dpi_panel_on[0]) + mdfld_dsi_brightness_control(dev, 0, + dev_priv->brightness_adjusted); + } + + if (dev_priv->dpi_panel_on[2]) + mdfld_dsi_brightness_control(dev, 2, 
+ dev_priv->brightness_adjusted); + gma_power_end(dev); + } + + /* cache the brightness for later use */ + dev_priv->brightness = level; + return 0; +} + +int mdfld_get_brightness(struct backlight_device *bd) +{ + struct drm_device *dev = + (struct drm_device *)bl_get_data(mdfld_backlight_device); + struct drm_psb_private *dev_priv = dev->dev_private; + + DRM_DEBUG_DRIVER("brightness = 0x%x \n", dev_priv->brightness); + + /* return locally cached var instead of HW read (due to DPST etc.) */ + return dev_priv->brightness; +} + +static const struct backlight_ops mdfld_ops = { + .get_brightness = mdfld_get_brightness, + .update_status = mdfld_set_brightness, +}; + +static int device_backlight_init(struct drm_device *dev) +{ + struct drm_psb_private *dev_priv = (struct drm_psb_private *) + dev->dev_private; + + dev_priv->blc_adj1 = BLC_ADJUSTMENT_MAX; + dev_priv->blc_adj2 = BLC_ADJUSTMENT_MAX; + + return 0; +} + +int mdfld_backlight_init(struct drm_device *dev) +{ + struct backlight_properties props; + int ret = 0; + + memset(&props, 0, sizeof(struct backlight_properties)); + props.max_brightness = BRIGHTNESS_MAX_LEVEL; + props.type = BACKLIGHT_PLATFORM; + mdfld_backlight_device = backlight_device_register("mdfld-bl", + NULL, (void *)dev, &mdfld_ops, &props); + + if (IS_ERR(mdfld_backlight_device)) + return PTR_ERR(mdfld_backlight_device); + + ret = device_backlight_init(dev); + if (ret) + return ret; + + mdfld_backlight_device->props.brightness = BRIGHTNESS_MAX_LEVEL; + mdfld_backlight_device->props.max_brightness = BRIGHTNESS_MAX_LEVEL; + backlight_update_status(mdfld_backlight_device); + return 0; +} +#endif + +struct backlight_device *mdfld_get_backlight_device(void) +{ +#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE + return mdfld_backlight_device; +#else + return NULL; +#endif +} + +/* + * mdfld_save_display_registers + * + * Description: We are going to suspend so save current display + * register state. 
+ * + * Notes: FIXME_JLIU7 need to add the support for DPI MIPI & HDMI audio + */ +static int mdfld_save_display_registers(struct drm_device *dev, int pipe) +{ + struct drm_psb_private *dev_priv = dev->dev_private; + struct medfield_state *regs = &dev_priv->regs.mdfld; + int i; + + /* register */ + u32 dpll_reg = MRST_DPLL_A; + u32 fp_reg = MRST_FPA0; + u32 pipeconf_reg = PIPEACONF; + u32 htot_reg = HTOTAL_A; + u32 hblank_reg = HBLANK_A; + u32 hsync_reg = HSYNC_A; + u32 vtot_reg = VTOTAL_A; + u32 vblank_reg = VBLANK_A; + u32 vsync_reg = VSYNC_A; + u32 pipesrc_reg = PIPEASRC; + u32 dspstride_reg = DSPASTRIDE; + u32 dsplinoff_reg = DSPALINOFF; + u32 dsptileoff_reg = DSPATILEOFF; + u32 dspsize_reg = DSPASIZE; + u32 dsppos_reg = DSPAPOS; + u32 dspsurf_reg = DSPASURF; + u32 mipi_reg = MIPI; + u32 dspcntr_reg = DSPACNTR; + u32 dspstatus_reg = PIPEASTAT; + u32 palette_reg = PALETTE_A; + + /* pointer to values */ + u32 *dpll_val = &regs->saveDPLL_A; + u32 *fp_val = &regs->saveFPA0; + u32 *pipeconf_val = &regs->savePIPEACONF; + u32 *htot_val = &regs->saveHTOTAL_A; + u32 *hblank_val = &regs->saveHBLANK_A; + u32 *hsync_val = &regs->saveHSYNC_A; + u32 *vtot_val = &regs->saveVTOTAL_A; + u32 *vblank_val = &regs->saveVBLANK_A; + u32 *vsync_val = &regs->saveVSYNC_A; + u32 *pipesrc_val = &regs->savePIPEASRC; + u32 *dspstride_val = &regs->saveDSPASTRIDE; + u32 *dsplinoff_val = &regs->saveDSPALINOFF; + u32 *dsptileoff_val = &regs->saveDSPATILEOFF; + u32 *dspsize_val = &regs->saveDSPASIZE; + u32 *dsppos_val = &regs->saveDSPAPOS; + u32 *dspsurf_val = &regs->saveDSPASURF; + u32 *mipi_val = &regs->saveMIPI; + u32 *dspcntr_val = &regs->saveDSPACNTR; + u32 *dspstatus_val = &regs->saveDSPASTATUS; + u32 *palette_val = regs->save_palette_a; + + switch (pipe) { + case 0: + break; + case 1: + /* register */ + dpll_reg = MDFLD_DPLL_B; + fp_reg = MDFLD_DPLL_DIV0; + pipeconf_reg = PIPEBCONF; + htot_reg = HTOTAL_B; + hblank_reg = HBLANK_B; + hsync_reg = HSYNC_B; + vtot_reg = VTOTAL_B; + vblank_reg = VBLANK_B; + vsync_reg = VSYNC_B; + pipesrc_reg = 
PIPEBSRC; + dspstride_reg = DSPBSTRIDE; + dsplinoff_reg = DSPBLINOFF; + dsptileoff_reg = DSPBTILEOFF; + dspsize_reg = DSPBSIZE; + dsppos_reg = DSPBPOS; + dspsurf_reg = DSPBSURF; + dspcntr_reg = DSPBCNTR; + dspstatus_reg = PIPEBSTAT; + palette_reg = PALETTE_B; + + /* values */ + dpll_val = &regs->saveDPLL_B; + fp_val = &regs->saveFPB0; + pipeconf_val = &regs->savePIPEBCONF; + htot_val = &regs->saveHTOTAL_B; + hblank_val = &regs->saveHBLANK_B; + hsync_val = &regs->saveHSYNC_B; + vtot_val = &regs->saveVTOTAL_B; + vblank_val = &regs->saveVBLANK_B; + vsync_val = &regs->saveVSYNC_B; + pipesrc_val = &regs->savePIPEBSRC; + dspstride_val = &regs->saveDSPBSTRIDE; + dsplinoff_val = &regs->saveDSPBLINOFF; + dsptileoff_val = &regs->saveDSPBTILEOFF; + dspsize_val = &regs->saveDSPBSIZE; + dsppos_val = &regs->saveDSPBPOS; + dspsurf_val = &regs->saveDSPBSURF; + dspcntr_val = &regs->saveDSPBCNTR; + dspstatus_val = &regs->saveDSPBSTATUS; + palette_val = regs->save_palette_b; + break; + case 2: + /* register */ + pipeconf_reg = PIPECCONF; + htot_reg = HTOTAL_C; + hblank_reg = HBLANK_C; + hsync_reg = HSYNC_C; + vtot_reg = VTOTAL_C; + vblank_reg = VBLANK_C; + vsync_reg = VSYNC_C; + pipesrc_reg = PIPECSRC; + dspstride_reg = DSPCSTRIDE; + dsplinoff_reg = DSPCLINOFF; + dsptileoff_reg = DSPCTILEOFF; + dspsize_reg = DSPCSIZE; + dsppos_reg = DSPCPOS; + dspsurf_reg = DSPCSURF; + mipi_reg = MIPI_C; + dspcntr_reg = DSPCCNTR; + dspstatus_reg = PIPECSTAT; + palette_reg = PALETTE_C; + + /* pointer to values */ + pipeconf_val = &regs->savePIPECCONF; + htot_val = &regs->saveHTOTAL_C; + hblank_val = &regs->saveHBLANK_C; + hsync_val = &regs->saveHSYNC_C; + vtot_val = &regs->saveVTOTAL_C; + vblank_val = &regs->saveVBLANK_C; + vsync_val = &regs->saveVSYNC_C; + pipesrc_val = &regs->savePIPECSRC; + dspstride_val = &regs->saveDSPCSTRIDE; + dsplinoff_val = &regs->saveDSPCLINOFF; + dsptileoff_val = &regs->saveDSPCTILEOFF; + dspsize_val = &regs->saveDSPCSIZE; + dsppos_val = &regs->saveDSPCPOS; + dspsurf_val = &regs->saveDSPCSURF; + mipi_val = &regs->saveMIPI_C; + dspcntr_val = &regs->saveDSPCCNTR; + 
dspstatus_val = &regs->saveDSPCSTATUS; + palette_val = regs->save_palette_c; + break; + default: + DRM_ERROR("%s, invalid pipe number.\n", __func__); + return -EINVAL; + } + + /* Pipe & plane A info */ + *dpll_val = PSB_RVDC32(dpll_reg); + *fp_val = PSB_RVDC32(fp_reg); + *pipeconf_val = PSB_RVDC32(pipeconf_reg); + *htot_val = PSB_RVDC32(htot_reg); + *hblank_val = PSB_RVDC32(hblank_reg); + *hsync_val = PSB_RVDC32(hsync_reg); + *vtot_val = PSB_RVDC32(vtot_reg); + *vblank_val = PSB_RVDC32(vblank_reg); + *vsync_val = PSB_RVDC32(vsync_reg); + *pipesrc_val = PSB_RVDC32(pipesrc_reg); + *dspstride_val = PSB_RVDC32(dspstride_reg); + *dsplinoff_val = PSB_RVDC32(dsplinoff_reg); + *dsptileoff_val = PSB_RVDC32(dsptileoff_reg); + *dspsize_val = PSB_RVDC32(dspsize_reg); + *dsppos_val = PSB_RVDC32(dsppos_reg); + *dspsurf_val = PSB_RVDC32(dspsurf_reg); + *dspcntr_val = PSB_RVDC32(dspcntr_reg); + *dspstatus_val = PSB_RVDC32(dspstatus_reg); + + /*save palette (gamma) */ + for (i = 0; i < 256; i++) + palette_val[i] = PSB_RVDC32(palette_reg + (i << 2)); + + if (pipe == 1) { + regs->savePFIT_CONTROL = PSB_RVDC32(PFIT_CONTROL); + regs->savePFIT_PGM_RATIOS = PSB_RVDC32(PFIT_PGM_RATIOS); + + regs->saveHDMIPHYMISCCTL = PSB_RVDC32(HDMIPHYMISCCTL); + regs->saveHDMIB_CONTROL = PSB_RVDC32(HDMIB_CONTROL); + return 0; + } + + *mipi_val = PSB_RVDC32(mipi_reg); + return 0; +} + +/* + * mdfld_restore_display_registers + * + * Description: We are going to resume so restore display register state. + * + * Notes: FIXME_JLIU7 need to add the support for DPI MIPI & HDMI audio + */ +static int mdfld_restore_display_registers(struct drm_device *dev, int pipe) +{ + /* To get panel out of ULPS mode. 
*/ + u32 temp = 0; + u32 device_ready_reg = DEVICE_READY_REG; + struct drm_psb_private *dev_priv = dev->dev_private; + struct mdfld_dsi_config *dsi_config = NULL; + struct medfield_state *regs = &dev_priv->regs.mdfld; + u32 i = 0; + u32 dpll = 0; + u32 timeout = 0; + + /* regester */ + u32 dpll_reg = MRST_DPLL_A; + u32 fp_reg = MRST_FPA0; + u32 pipeconf_reg = PIPEACONF; + u32 htot_reg = HTOTAL_A; + u32 hblank_reg = HBLANK_A; + u32 hsync_reg = HSYNC_A; + u32 vtot_reg = VTOTAL_A; + u32 vblank_reg = VBLANK_A; + u32 vsync_reg = VSYNC_A; + u32 pipesrc_reg = PIPEASRC; + u32 dspstride_reg = DSPASTRIDE; + u32 dsplinoff_reg = DSPALINOFF; + u32 dsptileoff_reg = DSPATILEOFF; + u32 dspsize_reg = DSPASIZE; + u32 dsppos_reg = DSPAPOS; + u32 dspsurf_reg = DSPASURF; + u32 dspstatus_reg = PIPEASTAT; + u32 mipi_reg = MIPI; + u32 dspcntr_reg = DSPACNTR; + u32 palette_reg = PALETTE_A; + + /* values */ + u32 dpll_val = regs->saveDPLL_A & ~DPLL_VCO_ENABLE; + u32 fp_val = regs->saveFPA0; + u32 pipeconf_val = regs->savePIPEACONF; + u32 htot_val = regs->saveHTOTAL_A; + u32 hblank_val = regs->saveHBLANK_A; + u32 hsync_val = regs->saveHSYNC_A; + u32 vtot_val = regs->saveVTOTAL_A; + u32 vblank_val = regs->saveVBLANK_A; + u32 vsync_val = regs->saveVSYNC_A; + u32 pipesrc_val = regs->savePIPEASRC; + u32 dspstride_val = regs->saveDSPASTRIDE; + u32 dsplinoff_val = regs->saveDSPALINOFF; + u32 dsptileoff_val = regs->saveDSPATILEOFF; + u32 dspsize_val = regs->saveDSPASIZE; + u32 dsppos_val = regs->saveDSPAPOS; + u32 dspsurf_val = regs->saveDSPASURF; + u32 dspstatus_val = regs->saveDSPASTATUS; + u32 mipi_val = regs->saveMIPI; + u32 dspcntr_val = regs->saveDSPACNTR; + u32 *palette_val = regs->save_palette_a; + + switch (pipe) { + case 0: + dsi_config = dev_priv->dsi_configs[0]; + break; + case 1: + /* regester */ + dpll_reg = MDFLD_DPLL_B; + fp_reg = MDFLD_DPLL_DIV0; + pipeconf_reg = PIPEBCONF; + htot_reg = HTOTAL_B; + hblank_reg = HBLANK_B; + hsync_reg = HSYNC_B; + vtot_reg = VTOTAL_B; + vblank_reg = 
VBLANK_B; + vsync_reg = VSYNC_B; + pipesrc_reg = PIPEBSRC; + dspstride_reg = DSPBSTRIDE; + dsplinoff_reg = DSPBLINOFF; + dsptileoff_reg = DSPBTILEOFF; + dspsize_reg = DSPBSIZE; + dsppos_reg = DSPBPOS; + dspsurf_reg = DSPBSURF; + dspcntr_reg = DSPBCNTR; + dspstatus_reg = PIPEBSTAT; + palette_reg = PALETTE_B; + + /* values */ + dpll_val = regs->saveDPLL_B & ~DPLL_VCO_ENABLE; + fp_val = regs->saveFPB0; + pipeconf_val = regs->savePIPEBCONF; + htot_val = regs->saveHTOTAL_B; + hblank_val = regs->saveHBLANK_B; + hsync_val = regs->saveHSYNC_B; + vtot_val = regs->saveVTOTAL_B; + vblank_val = regs->saveVBLANK_B; + vsync_val = regs->saveVSYNC_B; + pipesrc_val = regs->savePIPEBSRC; + dspstride_val = regs->saveDSPBSTRIDE; + dsplinoff_val = regs->saveDSPBLINOFF; + dsptileoff_val = regs->saveDSPBTILEOFF; + dspsize_val = regs->saveDSPBSIZE; + dsppos_val = regs->saveDSPBPOS; + dspsurf_val = regs->saveDSPBSURF; + dspcntr_val = regs->saveDSPBCNTR; + dspstatus_val = regs->saveDSPBSTATUS; + palette_val = regs->save_palette_b; + break; + case 2: + /* regester */ + pipeconf_reg = PIPECCONF; + htot_reg = HTOTAL_C; + hblank_reg = HBLANK_C; + hsync_reg = HSYNC_C; + vtot_reg = VTOTAL_C; + vblank_reg = VBLANK_C; + vsync_reg = VSYNC_C; + pipesrc_reg = PIPECSRC; + dspstride_reg = DSPCSTRIDE; + dsplinoff_reg = DSPCLINOFF; + dsptileoff_reg = DSPCTILEOFF; + dspsize_reg = DSPCSIZE; + dsppos_reg = DSPCPOS; + dspsurf_reg = DSPCSURF; + mipi_reg = MIPI_C; + dspcntr_reg = DSPCCNTR; + dspstatus_reg = PIPECSTAT; + palette_reg = PALETTE_C; + + /* values */ + pipeconf_val = regs->savePIPECCONF; + htot_val = regs->saveHTOTAL_C; + hblank_val = regs->saveHBLANK_C; + hsync_val = regs->saveHSYNC_C; + vtot_val = regs->saveVTOTAL_C; + vblank_val = regs->saveVBLANK_C; + vsync_val = regs->saveVSYNC_C; + pipesrc_val = regs->savePIPECSRC; + dspstride_val = regs->saveDSPCSTRIDE; + dsplinoff_val = regs->saveDSPCLINOFF; + dsptileoff_val = regs->saveDSPCTILEOFF; + dspsize_val = regs->saveDSPCSIZE; + dsppos_val = 
regs->saveDSPCPOS; + dspsurf_val = regs->saveDSPCSURF; + mipi_val = regs->saveMIPI_C; + dspcntr_val = regs->saveDSPCCNTR; + dspstatus_val = regs->saveDSPCSTATUS; + palette_val = regs->save_palette_c; + + dsi_config = dev_priv->dsi_configs[1]; + break; + default: + DRM_ERROR("%s, invalid pipe number.\n", __func__); + return -EINVAL; + } + + /*make sure VGA plane is off. it initializes to on after reset!*/ + PSB_WVDC32(0x80000000, VGACNTRL); + + if (pipe == 1) { + PSB_WVDC32(dpll_val & ~DPLL_VCO_ENABLE, dpll_reg); + PSB_RVDC32(dpll_reg); + + PSB_WVDC32(fp_val, fp_reg); + } else { + + dpll = PSB_RVDC32(dpll_reg); + + if (!(dpll & DPLL_VCO_ENABLE)) { + + /* When ungating power of DPLL, needs to wait 0.5us + before enable the VCO */ + if (dpll & MDFLD_PWR_GATE_EN) { + dpll &= ~MDFLD_PWR_GATE_EN; + PSB_WVDC32(dpll, dpll_reg); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + } + + PSB_WVDC32(fp_val, fp_reg); + PSB_WVDC32(dpll_val, dpll_reg); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + + dpll_val |= DPLL_VCO_ENABLE; + PSB_WVDC32(dpll_val, dpll_reg); + PSB_RVDC32(dpll_reg); + + /* wait for DSI PLL to lock */ + while (timeout < 20000 && + !(PSB_RVDC32(pipeconf_reg) & PIPECONF_DSIPLL_LOCK)) { + udelay(150); + timeout++; + } + + if (timeout == 20000) { + DRM_ERROR("%s, can't lock DSIPLL.\n", + __func__); + return -EINVAL; + } + } + } + /* Restore mode */ + PSB_WVDC32(htot_val, htot_reg); + PSB_WVDC32(hblank_val, hblank_reg); + PSB_WVDC32(hsync_val, hsync_reg); + PSB_WVDC32(vtot_val, vtot_reg); + PSB_WVDC32(vblank_val, vblank_reg); + PSB_WVDC32(vsync_val, vsync_reg); + PSB_WVDC32(pipesrc_val, pipesrc_reg); + PSB_WVDC32(dspstatus_val, dspstatus_reg); + + /*set up the plane*/ + PSB_WVDC32(dspstride_val, dspstride_reg); + PSB_WVDC32(dsplinoff_val, dsplinoff_reg); + PSB_WVDC32(dsptileoff_val, dsptileoff_reg); + PSB_WVDC32(dspsize_val, dspsize_reg); + PSB_WVDC32(dsppos_val, dsppos_reg); + PSB_WVDC32(dspsurf_val, dspsurf_reg); + + if (pipe == 
1) { + /* restore palette (gamma) */ + /*DRM_UDELAY(50000); */ + for (i = 0; i < 256; i++) + PSB_WVDC32(palette_val[i], palette_reg + (i << 2)); + + PSB_WVDC32(regs->savePFIT_CONTROL, PFIT_CONTROL); + PSB_WVDC32(regs->savePFIT_PGM_RATIOS, PFIT_PGM_RATIOS); + + /*TODO: resume HDMI port */ + + /*TODO: resume pipe*/ + + /*enable the plane*/ + PSB_WVDC32(dspcntr_val & ~DISPLAY_PLANE_ENABLE, dspcntr_reg); + + return 0; + } + + /*set up pipe related registers*/ + PSB_WVDC32(mipi_val, mipi_reg); + + /*setup MIPI adapter + MIPI IP registers*/ + if (dsi_config) + mdfld_dsi_controller_init(dsi_config, pipe); + + if (in_atomic() || in_interrupt()) + mdelay(20); + else + msleep(20); + + /*enable the plane*/ + PSB_WVDC32(dspcntr_val, dspcntr_reg); + + if (in_atomic() || in_interrupt()) + mdelay(20); + else + msleep(20); + + /* LP Hold Release */ + temp = REG_READ(mipi_reg); + temp |= LP_OUTPUT_HOLD_RELEASE; + REG_WRITE(mipi_reg, temp); + mdelay(1); + + + /* Set DSI host to exit from Utra Low Power State */ + temp = REG_READ(device_ready_reg); + temp &= ~ULPS_MASK; + temp |= 0x3; + temp |= EXIT_ULPS_DEV_READY; + REG_WRITE(device_ready_reg, temp); + mdelay(1); + + temp = REG_READ(device_ready_reg); + temp &= ~ULPS_MASK; + temp |= EXITING_ULPS; + REG_WRITE(device_ready_reg, temp); + mdelay(1); + + /*enable the pipe*/ + PSB_WVDC32(pipeconf_val, pipeconf_reg); + + /* restore palette (gamma) */ + /*DRM_UDELAY(50000); */ + for (i = 0; i < 256; i++) + PSB_WVDC32(palette_val[i], palette_reg + (i << 2)); + + return 0; +} + +static int mdfld_save_registers(struct drm_device *dev) +{ + /* mdfld_save_cursor_overlay_registers(dev); */ + mdfld_save_display_registers(dev, 0); + mdfld_save_display_registers(dev, 2); + mdfld_disable_crtc(dev, 0); + mdfld_disable_crtc(dev, 2); + + return 0; +} + +static int mdfld_restore_registers(struct drm_device *dev) +{ + mdfld_restore_display_registers(dev, 2); + mdfld_restore_display_registers(dev, 0); + /* mdfld_restore_cursor_overlay_registers(dev); */ + 
+ return 0; +} + +static int mdfld_power_down(struct drm_device *dev) +{ + /* FIXME */ + return 0; +} + +static int mdfld_power_up(struct drm_device *dev) +{ + /* FIXME */ + return 0; +} + +const struct psb_ops mdfld_chip_ops = { + .name = "mdfld", + .accel_2d = 0, + .pipes = 3, + .crtcs = 3, + .sgx_offset = MRST_SGX_OFFSET, + + .chip_setup = mid_chip_setup, + .crtc_helper = &mdfld_helper_funcs, + .crtc_funcs = &psb_intel_crtc_funcs, + + .output_init = mdfld_output_init, + +#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE + .backlight_init = mdfld_backlight_init, +#endif + + .save_regs = mdfld_save_registers, + .restore_regs = mdfld_restore_registers, + .power_down = mdfld_power_down, + .power_up = mdfld_power_up, +}; diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_dpi.c b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.c new file mode 100644 index 000000000000..fc0df28a668c --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.c @@ -0,0 +1,1024 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * jim liu + * Jackie Li + */ + +#include "mdfld_dsi_dpi.h" +#include "mdfld_output.h" +#include "mdfld_dsi_pkg_sender.h" +#include "psb_drv.h" +#include "tc35876x-dsi-lvds.h" + +static void mdfld_dsi_dpi_shut_down(struct mdfld_dsi_dpi_output *output, + int pipe); + +static void mdfld_wait_for_HS_DATA_FIFO(struct drm_device *dev, u32 pipe) +{ + u32 gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe); + int timeout = 0; + + udelay(500); + + /* This will time out after approximately 2+ seconds */ + while ((timeout < 20000) && + (REG_READ(gen_fifo_stat_reg) & DSI_FIFO_GEN_HS_DATA_FULL)) { + udelay(100); + timeout++; + } + + if (timeout == 20000) + DRM_INFO("MIPI: HS Data FIFO was never cleared!\n"); +} + +static void mdfld_wait_for_HS_CTRL_FIFO(struct drm_device *dev, u32 pipe) +{ + u32 gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe); + int timeout = 0; + + udelay(500); + + /* This will time out after approximately 2+ seconds */ + while ((timeout < 20000) && (REG_READ(gen_fifo_stat_reg) + & DSI_FIFO_GEN_HS_CTRL_FULL)) { + udelay(100); + timeout++; + } + if (timeout == 20000) + DRM_INFO("MIPI: HS CMD FIFO was never cleared!\n"); +} + +static void mdfld_wait_for_DPI_CTRL_FIFO(struct drm_device *dev, u32 pipe) +{ + u32 gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe); + int timeout = 0; + + udelay(500); + + /* This will time out after approximately 2+ seconds */ + while ((timeout < 20000) && ((REG_READ(gen_fifo_stat_reg) & + DPI_FIFO_EMPTY) != DPI_FIFO_EMPTY)) { + udelay(100); + timeout++; + } + + if (timeout == 20000) + DRM_ERROR("MIPI: DPI FIFO was never cleared\n"); +} + +static void mdfld_wait_for_SPL_PKG_SENT(struct drm_device *dev, u32 pipe) +{ + u32 intr_stat_reg = 
MIPI_INTR_STAT_REG(pipe); + int timeout = 0; + + udelay(500); + + /* This will time out after approximately 2+ seconds */ + while ((timeout < 20000) && (!(REG_READ(intr_stat_reg) + & DSI_INTR_STATE_SPL_PKG_SENT))) { + udelay(100); + timeout++; + } + + if (timeout == 20000) + DRM_ERROR("MIPI: SPL_PKT_SENT_INTERRUPT was not sent successfully!\n"); +} + +/* For TC35876X */ + +static void dsi_set_device_ready_state(struct drm_device *dev, int state, + int pipe) +{ + REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), !!state, 0, 0); +} + +static void dsi_set_pipe_plane_enable_state(struct drm_device *dev, + int state, int pipe) +{ + struct drm_psb_private *dev_priv = dev->dev_private; + u32 pipeconf_reg = PIPEACONF; + u32 dspcntr_reg = DSPACNTR; + + u32 dspcntr = dev_priv->dspcntr[pipe]; + u32 mipi = MIPI_PORT_EN | PASS_FROM_SPHY_TO_AFE | SEL_FLOPPED_HSTX; + + if (pipe) { + pipeconf_reg = PIPECCONF; + dspcntr_reg = DSPCCNTR; + } else + mipi &= (~0x03); + + if (state) { + /*Set up pipe */ + REG_WRITE(pipeconf_reg, BIT(31)); + + if (REG_BIT_WAIT(pipeconf_reg, 1, 30)) + dev_err(&dev->pdev->dev, "%s: Pipe enable timeout\n", + __func__); + + /*Set up display plane */ + REG_WRITE(dspcntr_reg, dspcntr); + } else { + u32 dspbase_reg = pipe ? MDFLD_DSPCBASE : MRST_DSPABASE; + + /* Put DSI lanes to ULPS to disable pipe */ + REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), 2, 2, 1); + REG_READ(MIPI_DEVICE_READY_REG(pipe)); /* posted write? */ + + /* LP Hold */ + REG_FLD_MOD(MIPI_PORT_CONTROL(pipe), 0, 16, 16); + REG_READ(MIPI_PORT_CONTROL(pipe)); /* posted write? */ + + /* Disable display plane */ + REG_FLD_MOD(dspcntr_reg, 0, 31, 31); + + /* Flush the plane changes ??? posted write? 
*/ + REG_WRITE(dspbase_reg, REG_READ(dspbase_reg)); + REG_READ(dspbase_reg); + + /* Disable PIPE */ + REG_FLD_MOD(pipeconf_reg, 0, 31, 31); + + if (REG_BIT_WAIT(pipeconf_reg, 0, 30)) + dev_err(&dev->pdev->dev, "%s: Pipe disable timeout\n", + __func__); + + if (REG_BIT_WAIT(MIPI_GEN_FIFO_STAT_REG(pipe), 1, 28)) + dev_err(&dev->pdev->dev, "%s: FIFO not empty\n", + __func__); + } +} + +static void mdfld_dsi_configure_down(struct mdfld_dsi_encoder *dsi_encoder, + int pipe) +{ + struct mdfld_dsi_dpi_output *dpi_output = + MDFLD_DSI_DPI_OUTPUT(dsi_encoder); + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_encoder_get_config(dsi_encoder); + struct drm_device *dev = dsi_config->dev; + struct drm_psb_private *dev_priv = dev->dev_private; + + if (!dev_priv->dpi_panel_on[pipe]) { + dev_err(dev->dev, "DPI panel is already off\n"); + return; + } + tc35876x_toshiba_bridge_panel_off(dev); + tc35876x_set_bridge_reset_state(dev, 1); + dsi_set_pipe_plane_enable_state(dev, 0, pipe); + mdfld_dsi_dpi_shut_down(dpi_output, pipe); + dsi_set_device_ready_state(dev, 0, pipe); +} + +static void mdfld_dsi_configure_up(struct mdfld_dsi_encoder *dsi_encoder, + int pipe) +{ + struct mdfld_dsi_dpi_output *dpi_output = + MDFLD_DSI_DPI_OUTPUT(dsi_encoder); + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_encoder_get_config(dsi_encoder); + struct drm_device *dev = dsi_config->dev; + struct drm_psb_private *dev_priv = dev->dev_private; + + if (dev_priv->dpi_panel_on[pipe]) { + dev_err(dev->dev, "DPI panel is already on\n"); + return; + } + + /* For resume path sequence */ + mdfld_dsi_dpi_shut_down(dpi_output, pipe); + dsi_set_device_ready_state(dev, 0, pipe); + + dsi_set_device_ready_state(dev, 1, pipe); + tc35876x_set_bridge_reset_state(dev, 0); + tc35876x_configure_lvds_bridge(dev); + mdfld_dsi_dpi_turn_on(dpi_output, pipe); /* Send turn on command */ + dsi_set_pipe_plane_enable_state(dev, 1, pipe); +} +/* End for TC35876X */ + +/* 
************************************************************************* *\ + * FUNCTION: mdfld_dsi_tpo_ic_init + * + * DESCRIPTION: This function is called only by mrst_dsi_mode_set and + * restore_display_registers. since this function does not + * acquire the mutex, it is important that the calling function + * does! +\* ************************************************************************* */ +static void mdfld_dsi_tpo_ic_init(struct mdfld_dsi_config *dsi_config, u32 pipe) +{ + struct drm_device *dev = dsi_config->dev; + u32 dcsChannelNumber = dsi_config->channel_num; + u32 gen_data_reg = MIPI_HS_GEN_DATA_REG(pipe); + u32 gen_ctrl_reg = MIPI_HS_GEN_CTRL_REG(pipe); + u32 gen_ctrl_val = GEN_LONG_WRITE; + + DRM_INFO("Enter mrst init TPO MIPI display.\n"); + + gen_ctrl_val |= dcsChannelNumber << DCS_CHANNEL_NUMBER_POS; + + /* Flip page order */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x00008036); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x02 << WORD_COUNTS_POS)); + + /* 0xF0 */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x005a5af0); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS)); + + /* Write protection key */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x005a5af1); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS)); + + /* 0xFC */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x005a5afc); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS)); + + /* 0xB7 */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x770000b7); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x00000044); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x05 << WORD_COUNTS_POS)); + + /* 0xB6 */ + 
mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x000a0ab6); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS)); + + /* 0xF2 */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x081010f2); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x4a070708); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x000000c5); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x09 << WORD_COUNTS_POS)); + + /* 0xF8 */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x024003f8); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x01030a04); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x0e020220); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x00000004); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x0d << WORD_COUNTS_POS)); + + /* 0xE2 */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x398fc3e2); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x0000916f); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x06 << WORD_COUNTS_POS)); + + /* 0xB0 */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x000000b0); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x02 << WORD_COUNTS_POS)); + + /* 0xF4 */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x240242f4); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x78ee2002); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x2a071050); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x507fee10); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x10300710); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, 
gen_ctrl_val | (0x14 << WORD_COUNTS_POS)); + + /* 0xBA */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x19fe07ba); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x101c0a31); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x00000010); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x09 << WORD_COUNTS_POS)); + + /* 0xBB */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x28ff07bb); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x24280a31); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x00000034); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x09 << WORD_COUNTS_POS)); + + /* 0xFB */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x535d05fb); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x1b1a2130); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x221e180e); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x131d2120); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x535d0508); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x1c1a2131); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x231f160d); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x111b2220); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x535c2008); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x1f1d2433); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x2c251a10); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x2c34372d); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x00000023); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x31 << WORD_COUNTS_POS)); + + /* 0xFA */ + 
mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x525c0bfa); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x1c1c232f); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x2623190e); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x18212625); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x545d0d0e); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x1e1d2333); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x26231a10); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x1a222725); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x545d280f); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x21202635); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x31292013); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x31393d33); + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x00000029); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x31 << WORD_COUNTS_POS)); + + /* Set DM */ + mdfld_wait_for_HS_DATA_FIFO(dev, pipe); + REG_WRITE(gen_data_reg, 0x000100f7); + mdfld_wait_for_HS_CTRL_FIFO(dev, pipe); + REG_WRITE(gen_ctrl_reg, gen_ctrl_val | (0x03 << WORD_COUNTS_POS)); +} + +static u16 mdfld_dsi_dpi_to_byte_clock_count(int pixel_clock_count, + int num_lane, int bpp) +{ + return (u16)((pixel_clock_count * bpp) / (num_lane * 8)); +} + +/* + * Calculate the dpi time basing on a given drm mode @mode + * return 0 on success. 
+ * FIXME: I was using proposed mode value for calculation, may need to + * use crtc mode values later + */ +int mdfld_dsi_dpi_timing_calculation(struct drm_display_mode *mode, + struct mdfld_dsi_dpi_timing *dpi_timing, + int num_lane, int bpp) +{ + int pclk_hsync, pclk_hfp, pclk_hbp, pclk_hactive; + int pclk_vsync, pclk_vfp, pclk_vbp, pclk_vactive; + + pclk_hactive = mode->hdisplay; + pclk_hfp = mode->hsync_start - mode->hdisplay; + pclk_hsync = mode->hsync_end - mode->hsync_start; + pclk_hbp = mode->htotal - mode->hsync_end; + + pclk_vactive = mode->vdisplay; + pclk_vfp = mode->vsync_start - mode->vdisplay; + pclk_vsync = mode->vsync_end - mode->vsync_start; + pclk_vbp = mode->vtotal - mode->vsync_end; + + /* + * byte clock counts were calculated by following formula + * bclock_count = pclk_count * bpp / num_lane / 8 + */ + dpi_timing->hsync_count = mdfld_dsi_dpi_to_byte_clock_count( + pclk_hsync, num_lane, bpp); + dpi_timing->hbp_count = mdfld_dsi_dpi_to_byte_clock_count( + pclk_hbp, num_lane, bpp); + dpi_timing->hfp_count = mdfld_dsi_dpi_to_byte_clock_count( + pclk_hfp, num_lane, bpp); + dpi_timing->hactive_count = mdfld_dsi_dpi_to_byte_clock_count( + pclk_hactive, num_lane, bpp); + dpi_timing->vsync_count = mdfld_dsi_dpi_to_byte_clock_count( + pclk_vsync, num_lane, bpp); + dpi_timing->vbp_count = mdfld_dsi_dpi_to_byte_clock_count( + pclk_vbp, num_lane, bpp); + dpi_timing->vfp_count = mdfld_dsi_dpi_to_byte_clock_count( + pclk_vfp, num_lane, bpp); + + return 0; +} + +void mdfld_dsi_dpi_controller_init(struct mdfld_dsi_config *dsi_config, + int pipe) +{ + struct drm_device *dev = dsi_config->dev; + int lane_count = dsi_config->lane_count; + struct mdfld_dsi_dpi_timing dpi_timing; + struct drm_display_mode *mode = dsi_config->mode; + u32 val; + + /*un-ready device*/ + REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), 0, 0, 0); + + /*init dsi adapter before kicking off*/ + REG_WRITE(MIPI_CTRL_REG(pipe), 0x00000018); + + /*enable all interrupts*/ + 
REG_WRITE(MIPI_INTR_EN_REG(pipe), 0xffffffff); + + /*set up func_prg*/ + val = lane_count; + val |= dsi_config->channel_num << DSI_DPI_VIRT_CHANNEL_OFFSET; + + switch (dsi_config->bpp) { + case 16: + val |= DSI_DPI_COLOR_FORMAT_RGB565; + break; + case 18: + val |= DSI_DPI_COLOR_FORMAT_RGB666; + break; + case 24: + val |= DSI_DPI_COLOR_FORMAT_RGB888; + break; + default: + DRM_ERROR("unsupported color format, bpp = %d\n", + dsi_config->bpp); + } + REG_WRITE(MIPI_DSI_FUNC_PRG_REG(pipe), val); + + REG_WRITE(MIPI_HS_TX_TIMEOUT_REG(pipe), + (mode->vtotal * mode->htotal * dsi_config->bpp / + (8 * lane_count)) & DSI_HS_TX_TIMEOUT_MASK); + REG_WRITE(MIPI_LP_RX_TIMEOUT_REG(pipe), + 0xffff & DSI_LP_RX_TIMEOUT_MASK); + + /*max value: 20 clock cycles of txclkesc*/ + REG_WRITE(MIPI_TURN_AROUND_TIMEOUT_REG(pipe), + 0x14 & DSI_TURN_AROUND_TIMEOUT_MASK); + + /*min 21 txclkesc, max: ffffh*/ + REG_WRITE(MIPI_DEVICE_RESET_TIMER_REG(pipe), + 0xffff & DSI_RESET_TIMER_MASK); + + REG_WRITE(MIPI_DPI_RESOLUTION_REG(pipe), + mode->vdisplay << 16 | mode->hdisplay); + + /*set DPI timing registers*/ + mdfld_dsi_dpi_timing_calculation(mode, &dpi_timing, + dsi_config->lane_count, dsi_config->bpp); + + REG_WRITE(MIPI_HSYNC_COUNT_REG(pipe), + dpi_timing.hsync_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_HBP_COUNT_REG(pipe), + dpi_timing.hbp_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_HFP_COUNT_REG(pipe), + dpi_timing.hfp_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_HACTIVE_COUNT_REG(pipe), + dpi_timing.hactive_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_VSYNC_COUNT_REG(pipe), + dpi_timing.vsync_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_VBP_COUNT_REG(pipe), + dpi_timing.vbp_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_VFP_COUNT_REG(pipe), + dpi_timing.vfp_count & DSI_DPI_TIMING_MASK); + + REG_WRITE(MIPI_HIGH_LOW_SWITCH_COUNT_REG(pipe), 0x46); + + /*min: 7d0 max: 4e20*/ + REG_WRITE(MIPI_INIT_COUNT_REG(pipe), 0x000007d0); + + /*set up video mode*/ + val = dsi_config->video_mode | 
DSI_DPI_COMPLETE_LAST_LINE; + REG_WRITE(MIPI_VIDEO_MODE_FORMAT_REG(pipe), val); + + REG_WRITE(MIPI_EOT_DISABLE_REG(pipe), 0x00000000); + + REG_WRITE(MIPI_LP_BYTECLK_REG(pipe), 0x00000004); + + /*TODO: figure out how to setup these registers*/ + if (mdfld_get_panel_type(dev, pipe) == TC35876X) + REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x2A0c6008); + else + REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x150c3408); + + REG_WRITE(MIPI_CLK_LANE_SWITCH_TIME_CNT_REG(pipe), (0xa << 16) | 0x14); + + if (mdfld_get_panel_type(dev, pipe) == TC35876X) + tc35876x_set_bridge_reset_state(dev, 0); /*Pull High Reset */ + + /*set device ready*/ + REG_FLD_MOD(MIPI_DEVICE_READY_REG(pipe), 1, 0, 0); +} + +void mdfld_dsi_dpi_turn_on(struct mdfld_dsi_dpi_output *output, int pipe) +{ + struct drm_device *dev = output->dev; + + /* clear special packet sent bit */ + if (REG_READ(MIPI_INTR_STAT_REG(pipe)) & DSI_INTR_STATE_SPL_PKG_SENT) + REG_WRITE(MIPI_INTR_STAT_REG(pipe), + DSI_INTR_STATE_SPL_PKG_SENT); + + /*send turn on package*/ + REG_WRITE(MIPI_DPI_CONTROL_REG(pipe), DSI_DPI_CTRL_HS_TURN_ON); + + /*wait for SPL_PKG_SENT interrupt*/ + mdfld_wait_for_SPL_PKG_SENT(dev, pipe); + + if (REG_READ(MIPI_INTR_STAT_REG(pipe)) & DSI_INTR_STATE_SPL_PKG_SENT) + REG_WRITE(MIPI_INTR_STAT_REG(pipe), + DSI_INTR_STATE_SPL_PKG_SENT); + + output->panel_on = 1; + + /* FIXME the following is disabled to WA the X slow start issue + for TMD panel + if (pipe == 2) + dev_priv->dpi_panel_on2 = true; + else if (pipe == 0) + dev_priv->dpi_panel_on = true; */ +} + +static void mdfld_dsi_dpi_shut_down(struct mdfld_dsi_dpi_output *output, + int pipe) +{ + struct drm_device *dev = output->dev; + + /*if output is on, or mode setting didn't happen, ignore this*/ + if ((!output->panel_on) || output->first_boot) { + output->first_boot = 0; + return; + } + + /* Wait for dpi fifo to empty */ + mdfld_wait_for_DPI_CTRL_FIFO(dev, pipe); + + /* Clear the special packet interrupt bit if set */ + if (REG_READ(MIPI_INTR_STAT_REG(pipe)) & 
DSI_INTR_STATE_SPL_PKG_SENT) + REG_WRITE(MIPI_INTR_STAT_REG(pipe), + DSI_INTR_STATE_SPL_PKG_SENT); + + if (REG_READ(MIPI_DPI_CONTROL_REG(pipe)) == DSI_DPI_CTRL_HS_SHUTDOWN) + goto shutdown_out; + + REG_WRITE(MIPI_DPI_CONTROL_REG(pipe), DSI_DPI_CTRL_HS_SHUTDOWN); + +shutdown_out: + output->panel_on = 0; + output->first_boot = 0; + + /* FIXME the following is disabled to WA the X slow start issue + for TMD panel + if (pipe == 2) + dev_priv->dpi_panel_on2 = false; + else if (pipe == 0) + dev_priv->dpi_panel_on = false; */ +} + +static void mdfld_dsi_dpi_set_power(struct drm_encoder *encoder, bool on) +{ + struct mdfld_dsi_encoder *dsi_encoder = mdfld_dsi_encoder(encoder); + struct mdfld_dsi_dpi_output *dpi_output = + MDFLD_DSI_DPI_OUTPUT(dsi_encoder); + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_encoder_get_config(dsi_encoder); + int pipe = mdfld_dsi_encoder_get_pipe(dsi_encoder); + struct drm_device *dev = dsi_config->dev; + struct drm_psb_private *dev_priv = dev->dev_private; + u32 pipeconf_reg = PIPEACONF; + + if (pipe) + pipeconf_reg = PIPECCONF; + + /*start up display island if it was shutdown*/ + if (!gma_power_begin(dev, true)) + return; + + if (on) { + if (mdfld_get_panel_type(dev, pipe) == TMD_VID) + mdfld_dsi_dpi_turn_on(dpi_output, pipe); + else if (mdfld_get_panel_type(dev, pipe) == TC35876X) + mdfld_dsi_configure_up(dsi_encoder, pipe); + else { + /*enable mipi port*/ + REG_WRITE(MIPI_PORT_CONTROL(pipe), + REG_READ(MIPI_PORT_CONTROL(pipe)) | BIT(31)); + REG_READ(MIPI_PORT_CONTROL(pipe)); + + mdfld_dsi_dpi_turn_on(dpi_output, pipe); + mdfld_dsi_tpo_ic_init(dsi_config, pipe); + } + dev_priv->dpi_panel_on[pipe] = true; + } else { + if (mdfld_get_panel_type(dev, pipe) == TMD_VID) + mdfld_dsi_dpi_shut_down(dpi_output, pipe); + else if (mdfld_get_panel_type(dev, pipe) == TC35876X) + mdfld_dsi_configure_down(dsi_encoder, pipe); + else { + mdfld_dsi_dpi_shut_down(dpi_output, pipe); + + /*disable mipi port*/ + REG_WRITE(MIPI_PORT_CONTROL(pipe), + 
REG_READ(MIPI_PORT_CONTROL(pipe)) & ~BIT(31)); + REG_READ(MIPI_PORT_CONTROL(pipe)); + } + dev_priv->dpi_panel_on[pipe] = false; + } + gma_power_end(dev); +} + +void mdfld_dsi_dpi_dpms(struct drm_encoder *encoder, int mode) +{ + mdfld_dsi_dpi_set_power(encoder, mode == DRM_MODE_DPMS_ON); +} + +bool mdfld_dsi_dpi_mode_fixup(struct drm_encoder *encoder, + struct drm_display_mode *mode, + struct drm_display_mode *adjusted_mode) +{ + struct mdfld_dsi_encoder *dsi_encoder = mdfld_dsi_encoder(encoder); + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_encoder_get_config(dsi_encoder); + struct drm_display_mode *fixed_mode = dsi_config->fixed_mode; + + if (fixed_mode) { + adjusted_mode->hdisplay = fixed_mode->hdisplay; + adjusted_mode->hsync_start = fixed_mode->hsync_start; + adjusted_mode->hsync_end = fixed_mode->hsync_end; + adjusted_mode->htotal = fixed_mode->htotal; + adjusted_mode->vdisplay = fixed_mode->vdisplay; + adjusted_mode->vsync_start = fixed_mode->vsync_start; + adjusted_mode->vsync_end = fixed_mode->vsync_end; + adjusted_mode->vtotal = fixed_mode->vtotal; + adjusted_mode->clock = fixed_mode->clock; + drm_mode_set_crtcinfo(adjusted_mode, CRTC_INTERLACE_HALVE_V); + } + return true; +} + +void mdfld_dsi_dpi_prepare(struct drm_encoder *encoder) +{ + mdfld_dsi_dpi_set_power(encoder, false); +} + +void mdfld_dsi_dpi_commit(struct drm_encoder *encoder) +{ + mdfld_dsi_dpi_set_power(encoder, true); +} + +/* For TC35876X */ +/* This functionality was implemented in FW in iCDK */ +/* But removed in DV0 and later. So need to add here. 
*/ +static void mipi_set_properties(struct mdfld_dsi_config *dsi_config, int pipe) +{ + struct drm_device *dev = dsi_config->dev; + + REG_WRITE(MIPI_CTRL_REG(pipe), 0x00000018); + REG_WRITE(MIPI_INTR_EN_REG(pipe), 0xffffffff); + REG_WRITE(MIPI_HS_TX_TIMEOUT_REG(pipe), 0xffffff); + REG_WRITE(MIPI_LP_RX_TIMEOUT_REG(pipe), 0xffffff); + REG_WRITE(MIPI_TURN_AROUND_TIMEOUT_REG(pipe), 0x14); + REG_WRITE(MIPI_DEVICE_RESET_TIMER_REG(pipe), 0xff); + REG_WRITE(MIPI_HIGH_LOW_SWITCH_COUNT_REG(pipe), 0x25); + REG_WRITE(MIPI_INIT_COUNT_REG(pipe), 0xf0); + REG_WRITE(MIPI_EOT_DISABLE_REG(pipe), 0x00000000); + REG_WRITE(MIPI_LP_BYTECLK_REG(pipe), 0x00000004); + REG_WRITE(MIPI_DBI_BW_CTRL_REG(pipe), 0x00000820); + REG_WRITE(MIPI_CLK_LANE_SWITCH_TIME_CNT_REG(pipe), (0xa << 16) | 0x14); +} + +static void mdfld_mipi_set_video_timing(struct mdfld_dsi_config *dsi_config, + int pipe) +{ + struct drm_device *dev = dsi_config->dev; + struct mdfld_dsi_dpi_timing dpi_timing; + struct drm_display_mode *mode = dsi_config->mode; + + mdfld_dsi_dpi_timing_calculation(mode, &dpi_timing, + dsi_config->lane_count, + dsi_config->bpp); + + REG_WRITE(MIPI_DPI_RESOLUTION_REG(pipe), + mode->vdisplay << 16 | mode->hdisplay); + REG_WRITE(MIPI_HSYNC_COUNT_REG(pipe), + dpi_timing.hsync_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_HBP_COUNT_REG(pipe), + dpi_timing.hbp_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_HFP_COUNT_REG(pipe), + dpi_timing.hfp_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_HACTIVE_COUNT_REG(pipe), + dpi_timing.hactive_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_VSYNC_COUNT_REG(pipe), + dpi_timing.vsync_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_VBP_COUNT_REG(pipe), + dpi_timing.vbp_count & DSI_DPI_TIMING_MASK); + REG_WRITE(MIPI_VFP_COUNT_REG(pipe), + dpi_timing.vfp_count & DSI_DPI_TIMING_MASK); +} + +static void mdfld_mipi_config(struct mdfld_dsi_config *dsi_config, int pipe) +{ + struct drm_device *dev = dsi_config->dev; + int lane_count = dsi_config->lane_count; + + if (pipe) 
{ + REG_WRITE(MIPI_PORT_CONTROL(0), 0x00000002); + REG_WRITE(MIPI_PORT_CONTROL(2), 0x80000000); + } else { + REG_WRITE(MIPI_PORT_CONTROL(0), 0x80010000); + REG_WRITE(MIPI_PORT_CONTROL(2), 0x00); + } + + REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x150A600F); + REG_WRITE(MIPI_VIDEO_MODE_FORMAT_REG(pipe), 0x0000000F); + + /* lane_count = 3 */ + REG_WRITE(MIPI_DSI_FUNC_PRG_REG(pipe), 0x00000200 | lane_count); + + mdfld_mipi_set_video_timing(dsi_config, pipe); +} + +static void mdfld_set_pipe_timing(struct mdfld_dsi_config *dsi_config, int pipe) +{ + struct drm_device *dev = dsi_config->dev; + struct drm_display_mode *mode = dsi_config->mode; + + REG_WRITE(HTOTAL_A, ((mode->htotal - 1) << 16) | (mode->hdisplay - 1)); + REG_WRITE(HBLANK_A, ((mode->htotal - 1) << 16) | (mode->hdisplay - 1)); + REG_WRITE(HSYNC_A, + ((mode->hsync_end - 1) << 16) | (mode->hsync_start - 1)); + + REG_WRITE(VTOTAL_A, ((mode->vtotal - 1) << 16) | (mode->vdisplay - 1)); + REG_WRITE(VBLANK_A, ((mode->vtotal - 1) << 16) | (mode->vdisplay - 1)); + REG_WRITE(VSYNC_A, + ((mode->vsync_end - 1) << 16) | (mode->vsync_start - 1)); + + REG_WRITE(PIPEASRC, + ((mode->hdisplay - 1) << 16) | (mode->vdisplay - 1)); +} +/* End for TC35876X */ + +void mdfld_dsi_dpi_mode_set(struct drm_encoder *encoder, + struct drm_display_mode *mode, + struct drm_display_mode *adjusted_mode) +{ + struct mdfld_dsi_encoder *dsi_encoder = mdfld_dsi_encoder(encoder); + struct mdfld_dsi_dpi_output *dpi_output = + MDFLD_DSI_DPI_OUTPUT(dsi_encoder); + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_encoder_get_config(dsi_encoder); + struct drm_device *dev = dsi_config->dev; + struct drm_psb_private *dev_priv = dev->dev_private; + int pipe = mdfld_dsi_encoder_get_pipe(dsi_encoder); + + u32 pipeconf_reg = PIPEACONF; + u32 dspcntr_reg = DSPACNTR; + + u32 pipeconf = dev_priv->pipeconf[pipe]; + u32 dspcntr = dev_priv->dspcntr[pipe]; + u32 mipi = MIPI_PORT_EN | PASS_FROM_SPHY_TO_AFE | SEL_FLOPPED_HSTX; + + if (pipe) { + pipeconf_reg = 
PIPECCONF; + dspcntr_reg = DSPCCNTR; + } else { + if (mdfld_get_panel_type(dev, pipe) == TC35876X) + mipi &= (~0x03); /* Use all four lanes */ + else + mipi |= 2; + } + + /*start up display island if it was shutdown*/ + if (!gma_power_begin(dev, true)) + return; + + if (mdfld_get_panel_type(dev, pipe) == TC35876X) { + /* + * The following logic is required to reset the bridge and + * configure. This also starts the DSI clock at 200MHz. + */ + tc35876x_set_bridge_reset_state(dev, 0); /*Pull High Reset */ + tc35876x_toshiba_bridge_panel_on(dev); + udelay(100); + /* Now start the DSI clock */ + REG_WRITE(MRST_DPLL_A, 0x00); + REG_WRITE(MRST_FPA0, 0xC1); + REG_WRITE(MRST_DPLL_A, 0x00800000); + udelay(500); + REG_WRITE(MRST_DPLL_A, 0x80800000); + + if (REG_BIT_WAIT(pipeconf_reg, 1, 29)) + dev_err(&dev->pdev->dev, "%s: DSI PLL lock timeout\n", + __func__); + + REG_WRITE(MIPI_DPHY_PARAM_REG(pipe), 0x2A0c6008); + + mipi_set_properties(dsi_config, pipe); + mdfld_mipi_config(dsi_config, pipe); + mdfld_set_pipe_timing(dsi_config, pipe); + + REG_WRITE(DSPABASE, 0x00); + REG_WRITE(DSPASTRIDE, (mode->hdisplay * 4)); + REG_WRITE(DSPASIZE, + ((mode->vdisplay - 1) << 16) | (mode->hdisplay - 1)); + + REG_WRITE(DSPACNTR, 0x98000000); + REG_WRITE(DSPASURF, 0x00); + + REG_WRITE(VGACNTRL, 0x80000000); + REG_WRITE(DEVICE_READY_REG, 0x00000001); + + REG_WRITE(MIPI_PORT_CONTROL(pipe), 0x80810000); + } else { + /*set up mipi port FIXME: do at init time */ + REG_WRITE(MIPI_PORT_CONTROL(pipe), mipi); + } + REG_READ(MIPI_PORT_CONTROL(pipe)); + + if (mdfld_get_panel_type(dev, pipe) == TMD_VID) { + /* NOP */ + } else if (mdfld_get_panel_type(dev, pipe) == TC35876X) { + /* set up DSI controller DPI interface */ + mdfld_dsi_dpi_controller_init(dsi_config, pipe); + + /* Configure MIPI Bridge and Panel */ + tc35876x_configure_lvds_bridge(dev); + dev_priv->dpi_panel_on[pipe] = true; + } else { + /*turn on DPI interface*/ + mdfld_dsi_dpi_turn_on(dpi_output, pipe); + } + + /*set up pipe*/ + 
REG_WRITE(pipeconf_reg, pipeconf); + REG_READ(pipeconf_reg); + + /*set up display plane*/ + REG_WRITE(dspcntr_reg, dspcntr); + REG_READ(dspcntr_reg); + + msleep(20); /* FIXME: this should wait for vblank */ + + if (mdfld_get_panel_type(dev, pipe) == TMD_VID) { + /* NOP */ + } else if (mdfld_get_panel_type(dev, pipe) == TC35876X) { + mdfld_dsi_dpi_turn_on(dpi_output, pipe); + } else { + /* init driver ic */ + mdfld_dsi_tpo_ic_init(dsi_config, pipe); + /*init backlight*/ + mdfld_dsi_brightness_init(dsi_config, pipe); + } + + gma_power_end(dev); +} + +/* + * Init DSI DPI encoder. + * Allocate an mdfld_dsi_encoder and attach it to given @dsi_connector + * return pointer of newly allocated DPI encoder, NULL on error + */ +struct mdfld_dsi_encoder *mdfld_dsi_dpi_init(struct drm_device *dev, + struct mdfld_dsi_connector *dsi_connector, + const struct panel_funcs *p_funcs) +{ + struct mdfld_dsi_dpi_output *dpi_output = NULL; + struct mdfld_dsi_config *dsi_config; + struct drm_connector *connector = NULL; + struct drm_encoder *encoder = NULL; + struct drm_display_mode *fixed_mode = NULL; + int pipe; + u32 data; + int ret; + + pipe = dsi_connector->pipe; + + if (mdfld_get_panel_type(dev, pipe) != TC35876X) { + dsi_config = mdfld_dsi_get_config(dsi_connector); + + /* panel hard-reset */ + if (p_funcs->reset) { + ret = p_funcs->reset(pipe); + if (ret) { + DRM_ERROR("Panel %d hard-reset failed\n", pipe); + return NULL; + } + } + + /* panel drvIC init */ + if (p_funcs->drv_ic_init) + p_funcs->drv_ic_init(dsi_config, pipe); + + /* panel power mode detect */ + ret = mdfld_dsi_get_power_mode(dsi_config, &data, false); + if (ret) { + DRM_ERROR("Panel %d get power mode failed\n", pipe); + dsi_connector->status = connector_status_disconnected; + } else { + DRM_INFO("pipe %d power mode 0x%x\n", pipe, data); + dsi_connector->status = connector_status_connected; + } + } + + dpi_output = kzalloc(sizeof(struct mdfld_dsi_dpi_output), GFP_KERNEL); + if (!dpi_output) { + DRM_ERROR("No 
memory\n"); + return NULL; + } + + if (dsi_connector->pipe) + dpi_output->panel_on = 0; + else + dpi_output->panel_on = 0; + + dpi_output->dev = dev; + if (mdfld_get_panel_type(dev, pipe) != TC35876X) + dpi_output->p_funcs = p_funcs; + dpi_output->first_boot = 1; + + /*get fixed mode*/ + dsi_config = mdfld_dsi_get_config(dsi_connector); + fixed_mode = dsi_config->fixed_mode; + + /*create drm encoder object*/ + connector = &dsi_connector->base.base; + encoder = &dpi_output->base.base.base; + drm_encoder_init(dev, + encoder, + p_funcs->encoder_funcs, + DRM_MODE_ENCODER_LVDS); + drm_encoder_helper_add(encoder, + p_funcs->encoder_helper_funcs); + + /*attach to given connector*/ + drm_mode_connector_attach_encoder(connector, encoder); + + /*set possible crtcs and clones*/ + if (dsi_connector->pipe) { + encoder->possible_crtcs = (1 << 2); + encoder->possible_clones = (1 << 1); + } else { + encoder->possible_crtcs = (1 << 0); + encoder->possible_clones = (1 << 0); + } + + dsi_connector->base.encoder = &dpi_output->base.base; + + return &dpi_output->base; +} diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_dpi.h b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.h new file mode 100644 index 000000000000..6f762478b959 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_dsi_dpi.h @@ -0,0 +1,79 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * jim liu + * Jackie Li + */ + +#ifndef __MDFLD_DSI_DPI_H__ +#define __MDFLD_DSI_DPI_H__ + +#include "mdfld_dsi_output.h" +#include "mdfld_output.h" + +struct mdfld_dsi_dpi_timing { + u16 hsync_count; + u16 hbp_count; + u16 hfp_count; + u16 hactive_count; + u16 vsync_count; + u16 vbp_count; + u16 vfp_count; +}; + +struct mdfld_dsi_dpi_output { + struct mdfld_dsi_encoder base; + struct drm_device *dev; + + int panel_on; + int first_boot; + + const struct panel_funcs *p_funcs; +}; + +#define MDFLD_DSI_DPI_OUTPUT(dsi_encoder)\ + container_of(dsi_encoder, struct mdfld_dsi_dpi_output, base) + +/* Export functions */ +extern int mdfld_dsi_dpi_timing_calculation(struct drm_display_mode *mode, + struct mdfld_dsi_dpi_timing *dpi_timing, + int num_lane, int bpp); +extern struct mdfld_dsi_encoder *mdfld_dsi_dpi_init(struct drm_device *dev, + struct mdfld_dsi_connector *dsi_connector, + const struct panel_funcs *p_funcs); + +/* MDFLD DPI helper functions */ +extern void mdfld_dsi_dpi_dpms(struct drm_encoder *encoder, int mode); +extern bool mdfld_dsi_dpi_mode_fixup(struct drm_encoder *encoder, + struct drm_display_mode *mode, + struct drm_display_mode *adjusted_mode); +extern void mdfld_dsi_dpi_prepare(struct drm_encoder *encoder); +extern void mdfld_dsi_dpi_commit(struct drm_encoder *encoder); +extern void mdfld_dsi_dpi_mode_set(struct drm_encoder *encoder, + struct drm_display_mode *mode, + struct drm_display_mode *adjusted_mode); +extern void mdfld_dsi_dpi_turn_on(struct 
mdfld_dsi_dpi_output *output, + int pipe); +extern void mdfld_dsi_dpi_controller_init(struct mdfld_dsi_config *dsi_config, + int pipe); +#endif /*__MDFLD_DSI_DPI_H__*/ diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_output.c b/drivers/gpu/drm/gma500/mdfld_dsi_output.c new file mode 100644 index 000000000000..9338c28f3999 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_dsi_output.c @@ -0,0 +1,635 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * jim liu + * Jackie Li + */ + +#include + +#include "mdfld_dsi_output.h" +#include "mdfld_dsi_dpi.h" +#include "mdfld_output.h" +#include "mdfld_dsi_pkg_sender.h" +#include "tc35876x-dsi-lvds.h" +#include +#include + +/* get the LABC from command line. 
*/ +static int LABC_control = 1; + +#ifdef MODULE +module_param(LABC_control, int, 0644); +#else + +static int __init parse_LABC_control(char *arg) +{ + /* LABC control can be passed in as a cmdline parameter */ + /* to enable this feature add LABC=1 to cmdline */ + /* to disable this feature add LABC=0 to cmdline */ + if (!arg) + return -EINVAL; + + if (!strcasecmp(arg, "0")) + LABC_control = 0; + else if (!strcasecmp(arg, "1")) + LABC_control = 1; + + return 0; +} +early_param("LABC", parse_LABC_control); +#endif + +/** + * Check and see if the generic control or data buffer is empty and ready. + */ +void mdfld_dsi_gen_fifo_ready(struct drm_device *dev, u32 gen_fifo_stat_reg, + u32 fifo_stat) +{ + u32 GEN_BF_time_out_count; + + /* Check MIPI Adatper command registers */ + for (GEN_BF_time_out_count = 0; + GEN_BF_time_out_count < GEN_FB_TIME_OUT; + GEN_BF_time_out_count++) { + if ((REG_READ(gen_fifo_stat_reg) & fifo_stat) == fifo_stat) + break; + udelay(100); + } + + if (GEN_BF_time_out_count == GEN_FB_TIME_OUT) + DRM_ERROR("mdfld_dsi_gen_fifo_ready, Timeout. gen_fifo_stat_reg = 0x%x.\n", + gen_fifo_stat_reg); +} + +/** + * Manage the DSI MIPI keyboard and display brightness. + * FIXME: this is exported to OSPM code. should work out an specific + * display interface to OSPM. 
+ */
+
+void mdfld_dsi_brightness_init(struct mdfld_dsi_config *dsi_config, int pipe)
+{
+	struct mdfld_dsi_pkg_sender *sender =
+				mdfld_dsi_get_pkg_sender(dsi_config);
+	struct drm_device *dev;
+	struct drm_psb_private *dev_priv;
+	u32 gen_ctrl_val;
+
+	if (!sender) {
+		DRM_ERROR("No sender found\n");
+		return;
+	}
+
+	/*
+	 * Resolve the device pointers only after the NULL check above.
+	 * Reading sender->dev in the declarations dereferenced a NULL
+	 * sender before the check had a chance to reject it.
+	 */
+	dev = sender->dev;
+	dev_priv = dev->dev_private;
+
+	/* Set default display backlight value to 85% (0xd8)*/
+	mdfld_dsi_send_mcs_short(sender, write_display_brightness, 0xd8, 1,
+				true);
+
+	/* Set minimum brightness setting of CABC function to 20% (0x33)*/
+	mdfld_dsi_send_mcs_short(sender, write_cabc_min_bright, 0x33, 1, true);
+
+	/* Enable backlight or/and LABC */
+	gen_ctrl_val = BRIGHT_CNTL_BLOCK_ON | DISPLAY_DIMMING_ON |
+								BACKLIGHT_ON;
+	if (LABC_control == 1)
+		gen_ctrl_val |= DISPLAY_DIMMING_ON | DISPLAY_BRIGHTNESS_AUTO
+								| GAMMA_AUTO;
+
+	if (LABC_control == 1)
+		gen_ctrl_val |= AMBIENT_LIGHT_SENSE_ON;
+
+	/* Remembered so mdfld_dsi_brightness_control() can re-send it later */
+	dev_priv->mipi_ctrl_display = gen_ctrl_val;
+
+	mdfld_dsi_send_mcs_short(sender, write_ctrl_display, (u8)gen_ctrl_val,
+				1, true);
+
+	mdfld_dsi_send_mcs_short(sender, write_ctrl_cabc, UI_IMAGE, 1, true);
+}
+
+/*
+ * Set the panel backlight for the DSI output on @pipe (0 or 2).
+ *
+ * @level is scaled from 0..MDFLD_DSI_BRIGHTNESS_MAX_LEVEL to an 8-bit value
+ * and sent to the panel as an MCS short packet. Invalid arguments or a
+ * missing package sender are logged and the call returns without sending.
+ */
+void mdfld_dsi_brightness_control(struct drm_device *dev, int pipe, int level)
+{
+	struct mdfld_dsi_pkg_sender *sender;
+	struct drm_psb_private *dev_priv;
+	struct mdfld_dsi_config *dsi_config;
+	u32 gen_ctrl_val = 0;
+	int p_type = TMD_VID;
+
+	if (!dev || (pipe != 0 && pipe != 2)) {
+		DRM_ERROR("Invalid parameter\n");
+		return;
+	}
+
+	/* NOTE(review): panel type is always queried for pipe 0 - confirm */
+	p_type = mdfld_get_panel_type(dev, 0);
+
+	dev_priv = dev->dev_private;
+
+	/* pipe 2 uses the second DSI config slot, pipe 0 the first */
+	if (pipe)
+		dsi_config = dev_priv->dsi_configs[1];
+	else
+		dsi_config = dev_priv->dsi_configs[0];
+
+	sender = mdfld_dsi_get_pkg_sender(dsi_config);
+
+	if (!sender) {
+		DRM_ERROR("No sender found\n");
+		return;
+	}
+
+	gen_ctrl_val = (level * 0xff / MDFLD_DSI_BRIGHTNESS_MAX_LEVEL) & 0xff;
+
+	dev_dbg(sender->dev->dev, "pipe = %d, gen_ctrl_val = %d.\n",
+							pipe, gen_ctrl_val);
+
+	if (p_type == TMD_VID) {
+		/* Set display backlight value
*/ + mdfld_dsi_send_mcs_short(sender, tmd_write_display_brightness, + (u8)gen_ctrl_val, 1, true); + } else { + /* Set display backlight value */ + mdfld_dsi_send_mcs_short(sender, write_display_brightness, + (u8)gen_ctrl_val, 1, true); + + /* Enable backlight control */ + if (level == 0) + gen_ctrl_val = 0; + else + gen_ctrl_val = dev_priv->mipi_ctrl_display; + + mdfld_dsi_send_mcs_short(sender, write_ctrl_display, + (u8)gen_ctrl_val, 1, true); + } +} + +static int mdfld_dsi_get_panel_status(struct mdfld_dsi_config *dsi_config, + u8 dcs, u32 *data, bool hs) +{ + struct mdfld_dsi_pkg_sender *sender + = mdfld_dsi_get_pkg_sender(dsi_config); + + if (!sender || !data) { + DRM_ERROR("Invalid parameter\n"); + return -EINVAL; + } + + return mdfld_dsi_read_mcs(sender, dcs, data, 1, hs); +} + +int mdfld_dsi_get_power_mode(struct mdfld_dsi_config *dsi_config, u32 *mode, + bool hs) +{ + if (!dsi_config || !mode) { + DRM_ERROR("Invalid parameter\n"); + return -EINVAL; + } + + return mdfld_dsi_get_panel_status(dsi_config, 0x0a, mode, hs); +} + +int mdfld_dsi_get_diagnostic_result(struct mdfld_dsi_config *dsi_config, + u32 *result, bool hs) +{ + if (!dsi_config || !result) { + DRM_ERROR("Invalid parameter\n"); + return -EINVAL; + } + + return mdfld_dsi_get_panel_status(dsi_config, 0x0f, result, hs); +} + +/* + * NOTE: this function was used by OSPM. 
+ * TODO: will be removed later, should work out display interfaces for OSPM + */ +void mdfld_dsi_controller_init(struct mdfld_dsi_config *dsi_config, int pipe) +{ + if (!dsi_config || ((pipe != 0) && (pipe != 2))) { + DRM_ERROR("Invalid parameters\n"); + return; + } + + mdfld_dsi_dpi_controller_init(dsi_config, pipe); +} + +static void mdfld_dsi_connector_save(struct drm_connector *connector) +{ +} + +static void mdfld_dsi_connector_restore(struct drm_connector *connector) +{ +} + +/* FIXME: start using the force parameter */ +static enum drm_connector_status +mdfld_dsi_connector_detect(struct drm_connector *connector, bool force) +{ + struct mdfld_dsi_connector *dsi_connector + = mdfld_dsi_connector(connector); + + dsi_connector->status = connector_status_connected; + + return dsi_connector->status; +} + +static int mdfld_dsi_connector_set_property(struct drm_connector *connector, + struct drm_property *property, + uint64_t value) +{ + struct drm_encoder *encoder = connector->encoder; + struct backlight_device *psb_bd; + + if (!strcmp(property->name, "scaling mode") && encoder) { + struct psb_intel_crtc *psb_crtc = + to_psb_intel_crtc(encoder->crtc); + bool centerechange; + uint64_t val; + + if (!psb_crtc) + goto set_prop_error; + + switch (value) { + case DRM_MODE_SCALE_FULLSCREEN: + break; + case DRM_MODE_SCALE_NO_SCALE: + break; + case DRM_MODE_SCALE_ASPECT: + break; + default: + goto set_prop_error; + } + + if (drm_connector_property_get_value(connector, property, &val)) + goto set_prop_error; + + if (val == value) + goto set_prop_done; + + if (drm_connector_property_set_value(connector, + property, value)) + goto set_prop_error; + + centerechange = (val == DRM_MODE_SCALE_NO_SCALE) || + (value == DRM_MODE_SCALE_NO_SCALE); + + if (psb_crtc->saved_mode.hdisplay != 0 && + psb_crtc->saved_mode.vdisplay != 0) { + if (centerechange) { + if (!drm_crtc_helper_set_mode(encoder->crtc, + &psb_crtc->saved_mode, + encoder->crtc->x, + encoder->crtc->y, + 
encoder->crtc->fb))
+					goto set_prop_error;
+			} else {
+				struct drm_encoder_helper_funcs *funcs =
+						encoder->helper_private;
+				funcs->mode_set(encoder,
+					&psb_crtc->saved_mode,
+					&psb_crtc->saved_adjusted_mode);
+			}
+		}
+	} else if (!strcmp(property->name, "backlight") && encoder) {
+		if (drm_connector_property_set_value(connector, property,
+									value))
+			goto set_prop_error;
+		else {
+			psb_bd = mdfld_get_backlight_device();
+			if (psb_bd) {
+				psb_bd->props.brightness = value;
+				mdfld_set_brightness(psb_bd);
+			}
+		}
+	}
+set_prop_done:
+	return 0;
+set_prop_error:
+	return -1;
+}
+
+/*
+ * Tear down a DSI connector: remove it from sysfs, clean up the DRM
+ * connector, destroy its package sender and free the wrapper object.
+ */
+static void mdfld_dsi_connector_destroy(struct drm_connector *connector)
+{
+	struct mdfld_dsi_connector *dsi_connector =
+					mdfld_dsi_connector(connector);
+	struct mdfld_dsi_pkg_sender *sender;
+
+	if (!dsi_connector)
+		return;
+	drm_sysfs_connector_remove(connector);
+	drm_connector_cleanup(connector);
+	sender = dsi_connector->pkg_sender;
+	mdfld_dsi_pkg_sender_destroy(sender);
+	kfree(dsi_connector);
+}
+
+/*
+ * Report the modes supported by a DSI connector.
+ *
+ * The panel has a single fixed mode; a duplicate of it is added to the
+ * connector's probed list. Returns the number of modes added: 1 on success,
+ * 0 when there is no fixed mode or the duplicate could not be allocated.
+ */
+static int mdfld_dsi_connector_get_modes(struct drm_connector *connector)
+{
+	struct mdfld_dsi_connector *dsi_connector =
+				mdfld_dsi_connector(connector);
+	struct mdfld_dsi_config *dsi_config =
+				mdfld_dsi_get_config(dsi_connector);
+	struct drm_display_mode *fixed_mode = dsi_config->fixed_mode;
+	struct drm_display_mode *dup_mode = NULL;
+	struct drm_device *dev = connector->dev;
+
+	connector->display_info.min_vfreq = 0;
+	connector->display_info.max_vfreq = 200;
+	connector->display_info.min_hfreq = 0;
+	connector->display_info.max_hfreq = 200;
+
+	if (fixed_mode) {
+		dev_dbg(dev->dev, "fixed_mode %dx%d\n",
+				fixed_mode->hdisplay, fixed_mode->vdisplay);
+		dup_mode = drm_mode_duplicate(dev, fixed_mode);
+		/*
+		 * The duplicate is a fresh allocation and may be NULL;
+		 * handing NULL to drm_mode_probed_add() would crash.
+		 */
+		if (!dup_mode) {
+			DRM_ERROR("Failed to duplicate fixed mode\n");
+			return 0;
+		}
+		drm_mode_probed_add(connector, dup_mode);
+		return 1;
+	}
+	DRM_ERROR("Didn't get any modes!\n");
+	return 0;
+}
+
+static int mdfld_dsi_connector_mode_valid(struct drm_connector *connector,
+						struct drm_display_mode *mode)
+{
+	struct mdfld_dsi_connector *dsi_connector = +
mdfld_dsi_connector(connector); + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_get_config(dsi_connector); + struct drm_display_mode *fixed_mode = dsi_config->fixed_mode; + + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + + if (mode->flags & DRM_MODE_FLAG_INTERLACE) + return MODE_NO_INTERLACE; + + /** + * FIXME: current DC has no fitting unit, reject any mode setting + * request + * Will figure out a way to do up-scaling(pannel fitting) later. + **/ + if (fixed_mode) { + if (mode->hdisplay != fixed_mode->hdisplay) + return MODE_PANEL; + + if (mode->vdisplay != fixed_mode->vdisplay) + return MODE_PANEL; + } + + return MODE_OK; +} + +static void mdfld_dsi_connector_dpms(struct drm_connector *connector, int mode) +{ + if (mode == connector->dpms) + return; + + /*first, execute dpms*/ + + drm_helper_connector_dpms(connector, mode); +} + +static struct drm_encoder *mdfld_dsi_connector_best_encoder( + struct drm_connector *connector) +{ + struct mdfld_dsi_connector *dsi_connector = + mdfld_dsi_connector(connector); + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_get_config(dsi_connector); + return &dsi_config->encoder->base.base; +} + +/*DSI connector funcs*/ +static const struct drm_connector_funcs mdfld_dsi_connector_funcs = { + .dpms = /*drm_helper_connector_dpms*/mdfld_dsi_connector_dpms, + .save = mdfld_dsi_connector_save, + .restore = mdfld_dsi_connector_restore, + .detect = mdfld_dsi_connector_detect, + .fill_modes = drm_helper_probe_single_connector_modes, + .set_property = mdfld_dsi_connector_set_property, + .destroy = mdfld_dsi_connector_destroy, +}; + +/*DSI connector helper funcs*/ +static const struct drm_connector_helper_funcs + mdfld_dsi_connector_helper_funcs = { + .get_modes = mdfld_dsi_connector_get_modes, + .mode_valid = mdfld_dsi_connector_mode_valid, + .best_encoder = mdfld_dsi_connector_best_encoder, +}; + +static int mdfld_dsi_get_default_config(struct drm_device *dev, + struct mdfld_dsi_config *config, int pipe) +{ + 
if (!dev || !config) {
+		DRM_ERROR("Invalid parameters");
+		return -EINVAL;
+	}
+
+	config->bpp = 24;
+	/* The TC35876X bridge is driven over four lanes, other panels two */
+	if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+		config->lane_count = 4;
+	else
+		config->lane_count = 2;
+	config->channel_num = 0;
+
+	if (mdfld_get_panel_type(dev, pipe) == TMD_VID)
+		config->video_mode = MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_PULSE;
+	else if (mdfld_get_panel_type(dev, pipe) == TC35876X)
+		config->video_mode =
+				MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_EVENTS;
+	else
+		config->video_mode = MDFLD_DSI_VIDEO_BURST_MODE;
+
+	return 0;
+}
+
+/*
+ * Hard-reset the panel attached to @pipe by driving its reset GPIO high.
+ *
+ * Pipe 0 uses GPIO 128 and pipe 2 uses GPIO 34; any other pipe is rejected
+ * with -EINVAL. The GPIO is requested, set as an output at level 1, read
+ * back once and released again.
+ *
+ * Returns 0 on success or the error from the failing GPIO call.
+ */
+int mdfld_dsi_panel_reset(int pipe)
+{
+	unsigned gpio;
+	int ret = 0;
+
+	switch (pipe) {
+	case 0:
+		gpio = 128;
+		break;
+	case 2:
+		gpio = 34;
+		break;
+	default:
+		DRM_ERROR("Invalid output\n");
+		return -EINVAL;
+	}
+
+	ret = gpio_request(gpio, "gfx");
+	if (ret) {
+		DRM_ERROR("gpio_rqueset failed\n");
+		return ret;
+	}
+
+	ret = gpio_direction_output(gpio, 1);
+	if (ret) {
+		DRM_ERROR("gpio_direction_output failed\n");
+		goto gpio_error;
+	}
+
+	/*
+	 * Read back the line that was just driven. This used to read GPIO
+	 * 128 unconditionally, which is the wrong (and unrequested) line
+	 * when resetting the pipe 2 panel.
+	 */
+	gpio_get_value(gpio);
+
+gpio_error:
+	if (gpio_is_valid(gpio))
+		gpio_free(gpio);
+
+	return ret;
+}
+
+/*
+ * MIPI output init
+ * @dev drm device
+ * @pipe pipe number.
0 or 2
+ * @config DSI config to use; when NULL a default one is allocated here
+ * @p_vid_funcs panel callbacks, used to fetch the fixed mode and panel info
+ *
+ * Do the initialization of a MIPI output, including create DRM mode objects
+ * initialization of DSI output on @pipe
+ */
+void mdfld_dsi_output_init(struct drm_device *dev,
+			   int pipe,
+			   struct mdfld_dsi_config *config,
+			   const struct panel_funcs *p_vid_funcs)
+{
+	struct mdfld_dsi_config *dsi_config;
+	struct mdfld_dsi_connector *dsi_connector;
+	struct drm_connector *connector;
+	struct mdfld_dsi_encoder *encoder;
+	struct drm_psb_private *dev_priv;
+	struct panel_info dsi_panel_info;
+	u32 width_mm, height_mm;
+
+	if (!dev || ((pipe != 0) && (pipe != 2))) {
+		DRM_ERROR("Invalid parameter\n");
+		return;
+	}
+
+	/*
+	 * Touch dev only after it has been validated: the private-data
+	 * lookup and the debug print used to run before the NULL check.
+	 */
+	dev_priv = dev->dev_private;
+
+	dev_dbg(dev->dev, "init DSI output on pipe %d\n", pipe);
+
+	/*create a new connector*/
+	dsi_connector = kzalloc(sizeof(struct mdfld_dsi_connector), GFP_KERNEL);
+	if (!dsi_connector) {
+		DRM_ERROR("No memory");
+		return;
+	}
+
+	dsi_connector->pipe = pipe;
+
+	/*set DSI config*/
+	if (config)
+		dsi_config = config;
+	else {
+		dsi_config = kzalloc(sizeof(struct mdfld_dsi_config),
+					GFP_KERNEL);
+		if (!dsi_config) {
+			DRM_ERROR("cannot allocate memory for DSI config\n");
+			goto dsi_init_err0;
+		}
+		mdfld_dsi_get_default_config(dev, dsi_config, pipe);
+	}
+
+	dsi_connector->private = dsi_config;
+
+	dsi_config->changed = 1;
+	dsi_config->dev = dev;
+
+	/*
+	 * NOTE(review): the dsi_init_err0 exits below free only the
+	 * connector, so a dsi_config allocated above is not released.
+	 */
+	dsi_config->fixed_mode = p_vid_funcs->get_config_mode(dev);
+	if (p_vid_funcs->get_panel_info(dev, pipe, &dsi_panel_info))
+			goto dsi_init_err0;
+
+	width_mm = dsi_panel_info.width_mm;
+	height_mm = dsi_panel_info.height_mm;
+
+	dsi_config->mode = dsi_config->fixed_mode;
+	dsi_config->connector = dsi_connector;
+
+	if (!dsi_config->fixed_mode) {
+		DRM_ERROR("No pannel fixed mode was found\n");
+		goto dsi_init_err0;
+	}
+
+	/* The pipe 2 output (MIPI1) may only be set up once pipe 0 exists */
+	if (pipe && dev_priv->dsi_configs[0]) {
+		dsi_config->dvr_ic_inited = 0;
+		dev_priv->dsi_configs[1] = dsi_config;
+	} else if (pipe == 0) {
+		dsi_config->dvr_ic_inited = 1;
+		dev_priv->dsi_configs[0] = dsi_config;
+	} else {
+		DRM_ERROR("Trying 
to init MIPI1 before MIPI0\n"); + goto dsi_init_err0; + } + + + connector = &dsi_connector->base.base; + drm_connector_init(dev, connector, &mdfld_dsi_connector_funcs, + DRM_MODE_CONNECTOR_LVDS); + drm_connector_helper_add(connector, &mdfld_dsi_connector_helper_funcs); + + connector->display_info.subpixel_order = SubPixelHorizontalRGB; + connector->display_info.width_mm = width_mm; + connector->display_info.height_mm = height_mm; + connector->interlace_allowed = false; + connector->doublescan_allowed = false; + + /*attach properties*/ + drm_connector_attach_property(connector, + dev->mode_config.scaling_mode_property, + DRM_MODE_SCALE_FULLSCREEN); + drm_connector_attach_property(connector, + dev_priv->backlight_property, + MDFLD_DSI_BRIGHTNESS_MAX_LEVEL); + + /*init DSI package sender on this output*/ + if (mdfld_dsi_pkg_sender_init(dsi_connector, pipe)) { + DRM_ERROR("Package Sender initialization failed on pipe %d\n", + pipe); + goto dsi_init_err0; + } + + encoder = mdfld_dsi_dpi_init(dev, dsi_connector, p_vid_funcs); + if (!encoder) { + DRM_ERROR("Create DPI encoder failed\n"); + goto dsi_init_err1; + } + encoder->private = dsi_config; + dsi_config->encoder = encoder; + encoder->base.type = (pipe == 0) ? 
INTEL_OUTPUT_MIPI : + INTEL_OUTPUT_MIPI2; + drm_sysfs_connector_add(connector); + return; + + /*TODO: add code to destroy outputs on error*/ +dsi_init_err1: + /*destroy sender*/ + mdfld_dsi_pkg_sender_destroy(dsi_connector->pkg_sender); + + drm_connector_cleanup(connector); + + kfree(dsi_config->fixed_mode); + kfree(dsi_config); +dsi_init_err0: + kfree(dsi_connector); +} diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_output.h b/drivers/gpu/drm/gma500/mdfld_dsi_output.h new file mode 100644 index 000000000000..2cdf666536df --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_dsi_output.h @@ -0,0 +1,389 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * jim liu + * Jackie Li + */ + +#ifndef __MDFLD_DSI_OUTPUT_H__ +#define __MDFLD_DSI_OUTPUT_H__ + +#include +#include +#include +#include +#include +#include + +#include "psb_drv.h" +#include "psb_intel_drv.h" +#include "psb_intel_reg.h" +#include "mdfld_output.h" + +#include + +#define FLD_MASK(start, end) (((1 << ((start) - (end) + 1)) - 1) << (end)) +#define FLD_VAL(val, start, end) (((val) << (end)) & FLD_MASK(start, end)) +#define FLD_GET(val, start, end) (((val) & FLD_MASK(start, end)) >> (end)) +#define FLD_MOD(orig, val, start, end) \ + (((orig) & ~FLD_MASK(start, end)) | FLD_VAL(val, start, end)) + +#define REG_FLD_MOD(reg, val, start, end) \ + REG_WRITE(reg, FLD_MOD(REG_READ(reg), val, start, end)) + +static inline int REGISTER_FLD_WAIT(struct drm_device *dev, u32 reg, + u32 val, int start, int end) +{ + int t = 100000; + + while (FLD_GET(REG_READ(reg), start, end) != val) { + if (--t == 0) + return 1; + } + + return 0; +} + +#define REG_FLD_WAIT(reg, val, start, end) \ + REGISTER_FLD_WAIT(dev, reg, val, start, end) + +#define REG_BIT_WAIT(reg, val, bitnum) \ + REGISTER_FLD_WAIT(dev, reg, val, bitnum, bitnum) + +#define MDFLD_DSI_BRIGHTNESS_MAX_LEVEL 100 + +#ifdef DEBUG +#define CHECK_PIPE(pipe) ({ \ + const typeof(pipe) __pipe = (pipe); \ + BUG_ON(__pipe != 0 && __pipe != 2); \ + __pipe; }) +#else +#define CHECK_PIPE(pipe) (pipe) +#endif + +/* + * Actual MIPIA->MIPIC reg offset is 0x800, value 0x400 is valid for 0 and 2 + */ +#define REG_OFFSET(pipe) (CHECK_PIPE(pipe) * 0x400) + +/* mdfld DSI controller registers */ +#define MIPI_DEVICE_READY_REG(pipe) (0xb000 + REG_OFFSET(pipe)) +#define MIPI_INTR_STAT_REG(pipe) (0xb004 + REG_OFFSET(pipe)) +#define MIPI_INTR_EN_REG(pipe) (0xb008 + REG_OFFSET(pipe)) +#define MIPI_DSI_FUNC_PRG_REG(pipe) (0xb00c + REG_OFFSET(pipe)) +#define MIPI_HS_TX_TIMEOUT_REG(pipe) (0xb010 + REG_OFFSET(pipe)) +#define MIPI_LP_RX_TIMEOUT_REG(pipe) (0xb014 + REG_OFFSET(pipe)) +#define MIPI_TURN_AROUND_TIMEOUT_REG(pipe) 
(0xb018 + REG_OFFSET(pipe)) +#define MIPI_DEVICE_RESET_TIMER_REG(pipe) (0xb01c + REG_OFFSET(pipe)) +#define MIPI_DPI_RESOLUTION_REG(pipe) (0xb020 + REG_OFFSET(pipe)) +#define MIPI_DBI_FIFO_THROTTLE_REG(pipe) (0xb024 + REG_OFFSET(pipe)) +#define MIPI_HSYNC_COUNT_REG(pipe) (0xb028 + REG_OFFSET(pipe)) +#define MIPI_HBP_COUNT_REG(pipe) (0xb02c + REG_OFFSET(pipe)) +#define MIPI_HFP_COUNT_REG(pipe) (0xb030 + REG_OFFSET(pipe)) +#define MIPI_HACTIVE_COUNT_REG(pipe) (0xb034 + REG_OFFSET(pipe)) +#define MIPI_VSYNC_COUNT_REG(pipe) (0xb038 + REG_OFFSET(pipe)) +#define MIPI_VBP_COUNT_REG(pipe) (0xb03c + REG_OFFSET(pipe)) +#define MIPI_VFP_COUNT_REG(pipe) (0xb040 + REG_OFFSET(pipe)) +#define MIPI_HIGH_LOW_SWITCH_COUNT_REG(pipe) (0xb044 + REG_OFFSET(pipe)) +#define MIPI_DPI_CONTROL_REG(pipe) (0xb048 + REG_OFFSET(pipe)) +#define MIPI_DPI_DATA_REG(pipe) (0xb04c + REG_OFFSET(pipe)) +#define MIPI_INIT_COUNT_REG(pipe) (0xb050 + REG_OFFSET(pipe)) +#define MIPI_MAX_RETURN_PACK_SIZE_REG(pipe) (0xb054 + REG_OFFSET(pipe)) +#define MIPI_VIDEO_MODE_FORMAT_REG(pipe) (0xb058 + REG_OFFSET(pipe)) +#define MIPI_EOT_DISABLE_REG(pipe) (0xb05c + REG_OFFSET(pipe)) +#define MIPI_LP_BYTECLK_REG(pipe) (0xb060 + REG_OFFSET(pipe)) +#define MIPI_LP_GEN_DATA_REG(pipe) (0xb064 + REG_OFFSET(pipe)) +#define MIPI_HS_GEN_DATA_REG(pipe) (0xb068 + REG_OFFSET(pipe)) +#define MIPI_LP_GEN_CTRL_REG(pipe) (0xb06c + REG_OFFSET(pipe)) +#define MIPI_HS_GEN_CTRL_REG(pipe) (0xb070 + REG_OFFSET(pipe)) +#define MIPI_GEN_FIFO_STAT_REG(pipe) (0xb074 + REG_OFFSET(pipe)) +#define MIPI_HS_LS_DBI_ENABLE_REG(pipe) (0xb078 + REG_OFFSET(pipe)) +#define MIPI_DPHY_PARAM_REG(pipe) (0xb080 + REG_OFFSET(pipe)) +#define MIPI_DBI_BW_CTRL_REG(pipe) (0xb084 + REG_OFFSET(pipe)) +#define MIPI_CLK_LANE_SWITCH_TIME_CNT_REG(pipe) (0xb088 + REG_OFFSET(pipe)) + +#define MIPI_CTRL_REG(pipe) (0xb104 + REG_OFFSET(pipe)) +#define MIPI_DATA_ADD_REG(pipe) (0xb108 + REG_OFFSET(pipe)) +#define MIPI_DATA_LEN_REG(pipe) (0xb10c + REG_OFFSET(pipe)) +#define 
MIPI_CMD_ADD_REG(pipe) (0xb110 + REG_OFFSET(pipe)) +#define MIPI_CMD_LEN_REG(pipe) (0xb114 + REG_OFFSET(pipe)) + +/* non-uniform reg offset */ +#define MIPI_PORT_CONTROL(pipe) (CHECK_PIPE(pipe) ? MIPI_C : MIPI) + +#define DSI_DEVICE_READY (0x1) +#define DSI_POWER_STATE_ULPS_ENTER (0x2 << 1) +#define DSI_POWER_STATE_ULPS_EXIT (0x1 << 1) +#define DSI_POWER_STATE_ULPS_OFFSET (0x1) + + +#define DSI_ONE_DATA_LANE (0x1) +#define DSI_TWO_DATA_LANE (0x2) +#define DSI_THREE_DATA_LANE (0X3) +#define DSI_FOUR_DATA_LANE (0x4) +#define DSI_DPI_VIRT_CHANNEL_OFFSET (0x3) +#define DSI_DBI_VIRT_CHANNEL_OFFSET (0x5) +#define DSI_DPI_COLOR_FORMAT_RGB565 (0x01 << 7) +#define DSI_DPI_COLOR_FORMAT_RGB666 (0x02 << 7) +#define DSI_DPI_COLOR_FORMAT_RGB666_UNPACK (0x03 << 7) +#define DSI_DPI_COLOR_FORMAT_RGB888 (0x04 << 7) +#define DSI_DBI_COLOR_FORMAT_OPTION2 (0x05 << 13) + +#define DSI_INTR_STATE_RXSOTERROR BIT(0) + +#define DSI_INTR_STATE_SPL_PKG_SENT BIT(30) +#define DSI_INTR_STATE_TE BIT(31) + +#define DSI_HS_TX_TIMEOUT_MASK (0xffffff) + +#define DSI_LP_RX_TIMEOUT_MASK (0xffffff) + +#define DSI_TURN_AROUND_TIMEOUT_MASK (0x3f) + +#define DSI_RESET_TIMER_MASK (0xffff) + +#define DSI_DBI_FIFO_WM_HALF (0x0) +#define DSI_DBI_FIFO_WM_QUARTER (0x1) +#define DSI_DBI_FIFO_WM_LOW (0x2) + +#define DSI_DPI_TIMING_MASK (0xffff) + +#define DSI_INIT_TIMER_MASK (0xffff) + +#define DSI_DBI_RETURN_PACK_SIZE_MASK (0x3ff) + +#define DSI_LP_BYTECLK_MASK (0x0ffff) + +#define DSI_HS_CTRL_GEN_SHORT_W0 (0x03) +#define DSI_HS_CTRL_GEN_SHORT_W1 (0x13) +#define DSI_HS_CTRL_GEN_SHORT_W2 (0x23) +#define DSI_HS_CTRL_GEN_R0 (0x04) +#define DSI_HS_CTRL_GEN_R1 (0x14) +#define DSI_HS_CTRL_GEN_R2 (0x24) +#define DSI_HS_CTRL_GEN_LONG_W (0x29) +#define DSI_HS_CTRL_MCS_SHORT_W0 (0x05) +#define DSI_HS_CTRL_MCS_SHORT_W1 (0x15) +#define DSI_HS_CTRL_MCS_R0 (0x06) +#define DSI_HS_CTRL_MCS_LONG_W (0x39) +#define DSI_HS_CTRL_VC_OFFSET (0x06) +#define DSI_HS_CTRL_WC_OFFSET (0x08) + +#define DSI_FIFO_GEN_HS_DATA_FULL BIT(0) +#define 
DSI_FIFO_GEN_HS_DATA_HALF_EMPTY BIT(1) +#define DSI_FIFO_GEN_HS_DATA_EMPTY BIT(2) +#define DSI_FIFO_GEN_LP_DATA_FULL BIT(8) +#define DSI_FIFO_GEN_LP_DATA_HALF_EMPTY BIT(9) +#define DSI_FIFO_GEN_LP_DATA_EMPTY BIT(10) +#define DSI_FIFO_GEN_HS_CTRL_FULL BIT(16) +#define DSI_FIFO_GEN_HS_CTRL_HALF_EMPTY BIT(17) +#define DSI_FIFO_GEN_HS_CTRL_EMPTY BIT(18) +#define DSI_FIFO_GEN_LP_CTRL_FULL BIT(24) +#define DSI_FIFO_GEN_LP_CTRL_HALF_EMPTY BIT(25) +#define DSI_FIFO_GEN_LP_CTRL_EMPTY BIT(26) +#define DSI_FIFO_DBI_EMPTY BIT(27) +#define DSI_FIFO_DPI_EMPTY BIT(28) + +#define DSI_DBI_HS_LP_SWITCH_MASK (0x1) + +#define DSI_HS_LP_SWITCH_COUNTER_OFFSET (0x0) +#define DSI_LP_HS_SWITCH_COUNTER_OFFSET (0x16) + +#define DSI_DPI_CTRL_HS_SHUTDOWN (0x00000001) +#define DSI_DPI_CTRL_HS_TURN_ON (0x00000002) + +/*dsi power modes*/ +#define DSI_POWER_MODE_DISPLAY_ON BIT(2) +#define DSI_POWER_MODE_NORMAL_ON BIT(3) +#define DSI_POWER_MODE_SLEEP_OUT BIT(4) +#define DSI_POWER_MODE_PARTIAL_ON BIT(5) +#define DSI_POWER_MODE_IDLE_ON BIT(6) + +enum { + MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_PULSE = 1, + MDFLD_DSI_VIDEO_NON_BURST_MODE_SYNC_EVENTS = 2, + MDFLD_DSI_VIDEO_BURST_MODE = 3, +}; + +#define DSI_DPI_COMPLETE_LAST_LINE BIT(2) +#define DSI_DPI_DISABLE_BTA BIT(3) + +struct mdfld_dsi_connector_state { + u32 mipi_ctrl_reg; +}; + +struct mdfld_dsi_encoder_state { + +}; + +struct mdfld_dsi_connector { + struct psb_intel_connector base; + + int pipe; + void *private; + void *pkg_sender; + + /* Connection status */ + enum drm_connector_status status; +}; + +struct mdfld_dsi_encoder { + struct psb_intel_encoder base; + void *private; +}; + +/* + * DSI config, consists of one DSI connector, two DSI encoders. + * DRM will pick up on DSI encoder basing on differents configs. 
+ */ +struct mdfld_dsi_config { + struct drm_device *dev; + struct drm_display_mode *fixed_mode; + struct drm_display_mode *mode; + + struct mdfld_dsi_connector *connector; + struct mdfld_dsi_encoder *encoder; + + int changed; + + int bpp; + int lane_count; + /*Virtual channel number for this encoder*/ + int channel_num; + /*video mode configure*/ + int video_mode; + + int dvr_ic_inited; +}; + +static inline struct mdfld_dsi_connector *mdfld_dsi_connector( + struct drm_connector *connector) +{ + struct psb_intel_connector *psb_connector; + + psb_connector = to_psb_intel_connector(connector); + + return container_of(psb_connector, struct mdfld_dsi_connector, base); +} + +static inline struct mdfld_dsi_encoder *mdfld_dsi_encoder( + struct drm_encoder *encoder) +{ + struct psb_intel_encoder *psb_encoder; + + psb_encoder = to_psb_intel_encoder(encoder); + + return container_of(psb_encoder, struct mdfld_dsi_encoder, base); +} + +static inline struct mdfld_dsi_config * + mdfld_dsi_get_config(struct mdfld_dsi_connector *connector) +{ + if (!connector) + return NULL; + return (struct mdfld_dsi_config *)connector->private; +} + +static inline void *mdfld_dsi_get_pkg_sender(struct mdfld_dsi_config *config) +{ + struct mdfld_dsi_connector *dsi_connector; + + if (!config) + return NULL; + + dsi_connector = config->connector; + + if (!dsi_connector) + return NULL; + + return dsi_connector->pkg_sender; +} + +static inline struct mdfld_dsi_config * + mdfld_dsi_encoder_get_config(struct mdfld_dsi_encoder *encoder) +{ + if (!encoder) + return NULL; + return (struct mdfld_dsi_config *)encoder->private; +} + +static inline struct mdfld_dsi_connector * + mdfld_dsi_encoder_get_connector(struct mdfld_dsi_encoder *encoder) +{ + struct mdfld_dsi_config *config; + + if (!encoder) + return NULL; + + config = mdfld_dsi_encoder_get_config(encoder); + if (!config) + return NULL; + + return config->connector; +} + +static inline void *mdfld_dsi_encoder_get_pkg_sender( + struct mdfld_dsi_encoder 
*encoder) +{ + struct mdfld_dsi_config *dsi_config; + + dsi_config = mdfld_dsi_encoder_get_config(encoder); + if (!dsi_config) + return NULL; + + return mdfld_dsi_get_pkg_sender(dsi_config); +} + +static inline int mdfld_dsi_encoder_get_pipe(struct mdfld_dsi_encoder *encoder) +{ + struct mdfld_dsi_connector *connector; + + if (!encoder) + return -1; + + connector = mdfld_dsi_encoder_get_connector(encoder); + if (!connector) + return -1; + return connector->pipe; +} + +/* Export functions */ +extern void mdfld_dsi_gen_fifo_ready(struct drm_device *dev, + u32 gen_fifo_stat_reg, u32 fifo_stat); +extern void mdfld_dsi_brightness_init(struct mdfld_dsi_config *dsi_config, + int pipe); +extern void mdfld_dsi_brightness_control(struct drm_device *dev, int pipe, + int level); +extern void mdfld_dsi_output_init(struct drm_device *dev, + int pipe, + struct mdfld_dsi_config *config, + const struct panel_funcs *p_vid_funcs); +extern void mdfld_dsi_controller_init(struct mdfld_dsi_config *dsi_config, + int pipe); + +extern int mdfld_dsi_get_power_mode(struct mdfld_dsi_config *dsi_config, + u32 *mode, bool hs); +extern int mdfld_dsi_get_diagnostic_result(struct mdfld_dsi_config *dsi_config, + u32 *result, bool hs); +extern int mdfld_dsi_panel_reset(int pipe); + +#endif /*__MDFLD_DSI_OUTPUT_H__*/ diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c new file mode 100644 index 000000000000..f193acec657e --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.c @@ -0,0 +1,694 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Jackie Li + */ + +#include + +#include "mdfld_dsi_output.h" +#include "mdfld_dsi_pkg_sender.h" +#include "mdfld_dsi_dpi.h" + +#define MDFLD_DSI_READ_MAX_COUNT 5000 + +enum data_type { + DSI_DT_GENERIC_SHORT_WRITE_0 = 0x03, + DSI_DT_GENERIC_SHORT_WRITE_1 = 0x13, + DSI_DT_GENERIC_SHORT_WRITE_2 = 0x23, + DSI_DT_GENERIC_READ_0 = 0x04, + DSI_DT_GENERIC_READ_1 = 0x14, + DSI_DT_GENERIC_READ_2 = 0x24, + DSI_DT_GENERIC_LONG_WRITE = 0x29, + DSI_DT_DCS_SHORT_WRITE_0 = 0x05, + DSI_DT_DCS_SHORT_WRITE_1 = 0x15, + DSI_DT_DCS_READ = 0x06, + DSI_DT_DCS_LONG_WRITE = 0x39, +}; + +enum { + MDFLD_DSI_PANEL_MODE_SLEEP = 0x1, +}; + +enum { + MDFLD_DSI_PKG_SENDER_FREE = 0x0, + MDFLD_DSI_PKG_SENDER_BUSY = 0x1, +}; + +static const char *const dsi_errors[] = { + "RX SOT Error", + "RX SOT Sync Error", + "RX EOT Sync Error", + "RX Escape Mode Entry Error", + "RX LP TX Sync Error", + "RX HS Receive Timeout Error", + "RX False Control Error", + "RX ECC Single Bit Error", + "RX ECC Multibit Error", + "RX Checksum Error", + "RX DSI Data Type Not Recognised", + "RX DSI VC ID Invalid", + "TX False Control Error", + "TX ECC Single Bit Error", + "TX ECC Multibit Error", + "TX Checksum Error", + "TX DSI Data Type Not Recognised", + "TX DSI VC ID invalid", + "High 
Contention", + "Low contention", + "DPI FIFO Under run", + "HS TX Timeout", + "LP RX Timeout", + "Turn Around ACK Timeout", + "ACK With No Error", + "RX Invalid TX Length", + "RX Prot Violation", + "HS Generic Write FIFO Full", + "LP Generic Write FIFO Full", + /* one name per interrupt status bit (0-31); without the comma below the next two literals merge and bit 31 has no entry */ + "Generic Read Data Avail", + "Special Packet Sent", + "Tearing Effect", +}; + +/* + * Poll the generic FIFO status register until every bit in @mask is set, + * giving up after 0xffff polls spaced 100us apart. + * + * Returns 0 once all bits are set, -EIO (after logging the register) otherwise. + */ +static inline int wait_for_gen_fifo_empty(struct mdfld_dsi_pkg_sender *sender, + u32 mask) +{ + struct drm_device *dev = sender->dev; + u32 gen_fifo_stat_reg = sender->mipi_gen_fifo_stat_reg; + int retry = 0xffff; + + while (retry--) { + if ((mask & REG_READ(gen_fifo_stat_reg)) == mask) + return 0; + udelay(100); + } + DRM_ERROR("fifo is NOT empty 0x%08x\n", REG_READ(gen_fifo_stat_reg)); + return -EIO; +} + +/* Wait for the HS/LP data and control FIFO bits plus bits 27 and 28 (the *_EMPTY flags in mdfld_dsi_output.h). */ +static int wait_for_all_fifos_empty(struct mdfld_dsi_pkg_sender *sender) +{ + return wait_for_gen_fifo_empty(sender, (BIT(2) | BIT(10) | BIT(18) | + BIT(26) | BIT(27) | BIT(28))); +} + +/* Wait for the LP data (bit 10) and LP control (bit 26) FIFO-empty bits. */ +static int wait_for_lp_fifos_empty(struct mdfld_dsi_pkg_sender *sender) +{ + return wait_for_gen_fifo_empty(sender, (BIT(10) | BIT(26))); +} + +/* Wait for the HS data (bit 2) and HS control (bit 18) FIFO-empty bits. */ +static int wait_for_hs_fifos_empty(struct mdfld_dsi_pkg_sender *sender) +{ + return wait_for_gen_fifo_empty(sender, (BIT(2) | BIT(18))); +} + +/* Handle one interrupt status bit; dsi_error_handler() calls this with a single-bit @mask. */ +static int handle_dsi_error(struct mdfld_dsi_pkg_sender *sender, u32 mask) +{ + u32 intr_stat_reg = sender->mipi_intr_stat_reg; + struct drm_device *dev = sender->dev; + + dev_dbg(sender->dev->dev, "Handling error 0x%08x\n", mask); + + switch (mask) { + case BIT(0): + case BIT(1): + case BIT(2): + case BIT(3): + case BIT(4): + case BIT(5): + case BIT(6): + case BIT(7): + case BIT(8): + case BIT(9): + case BIT(10): + case BIT(11): + case BIT(12): + case BIT(13): + dev_dbg(sender->dev->dev, "No Action required\n"); + break; + case BIT(14): + /*wait for all fifo empty*/ + /*wait_for_all_fifos_empty(sender)*/; + break; + case BIT(15): + dev_dbg(sender->dev->dev, "No Action required\n"); + break; + case BIT(16): + break; + case BIT(17): + break; + case BIT(18): + case 
BIT(19): + dev_dbg(sender->dev->dev, "High/Low contention detected\n"); + /*wait for contention recovery time*/ + /*mdelay(10);*/ + /*wait for all fifo empty*/ + if (0) + wait_for_all_fifos_empty(sender); + break; + case BIT(20): + dev_dbg(sender->dev->dev, "No Action required\n"); + break; + case BIT(21): + /*wait for all fifo empty*/ + /*wait_for_all_fifos_empty(sender);*/ + break; + case BIT(22): + break; + case BIT(23): + case BIT(24): + case BIT(25): + case BIT(26): + case BIT(27): + dev_dbg(sender->dev->dev, "HS Gen fifo full\n"); + REG_WRITE(intr_stat_reg, mask); + wait_for_hs_fifos_empty(sender); + break; + case BIT(28): + dev_dbg(sender->dev->dev, "LP Gen fifo full\n"); + REG_WRITE(intr_stat_reg, mask); + wait_for_lp_fifos_empty(sender); + break; + case BIT(29): + case BIT(30): + case BIT(31): + dev_dbg(sender->dev->dev, "No Action required\n"); + break; + } + + if (mask & REG_READ(intr_stat_reg)) + dev_dbg(sender->dev->dev, + "Cannot clean interrupt 0x%08x\n", mask); + return 0; +} + +static int dsi_error_handler(struct mdfld_dsi_pkg_sender *sender) +{ + struct drm_device *dev = sender->dev; + u32 intr_stat_reg = sender->mipi_intr_stat_reg; + u32 mask; + u32 intr_stat; + int i; + int err = 0; + + intr_stat = REG_READ(intr_stat_reg); + + for (i = 0; i < 32; i++) { + mask = (0x00000001UL) << i; + if (intr_stat & mask) { + dev_dbg(sender->dev->dev, "[DSI]: %s\n", dsi_errors[i]); + err = handle_dsi_error(sender, mask); + if (err) + DRM_ERROR("Cannot handle error\n"); + } + } + return err; +} + +static int send_short_pkg(struct mdfld_dsi_pkg_sender *sender, u8 data_type, + u8 cmd, u8 param, bool hs) +{ + struct drm_device *dev = sender->dev; + u32 ctrl_reg; + u32 val; + u8 virtual_channel = 0; + + if (hs) { + ctrl_reg = sender->mipi_hs_gen_ctrl_reg; + + /* FIXME: wait_for_hs_fifos_empty(sender); */ + } else { + ctrl_reg = sender->mipi_lp_gen_ctrl_reg; + + /* FIXME: wait_for_lp_fifos_empty(sender); */ + } + + val = FLD_VAL(param, 23, 16) | FLD_VAL(cmd, 15, 8) | 
+ FLD_VAL(virtual_channel, 7, 6) | FLD_VAL(data_type, 5, 0); + + REG_WRITE(ctrl_reg, val); + + return 0; +} + +static int send_long_pkg(struct mdfld_dsi_pkg_sender *sender, u8 data_type, + u8 *data, int len, bool hs) +{ + struct drm_device *dev = sender->dev; + u32 ctrl_reg; + u32 data_reg; + u32 val; + u8 *p; + u8 b1, b2, b3, b4; + u8 virtual_channel = 0; + int i; + + if (hs) { + ctrl_reg = sender->mipi_hs_gen_ctrl_reg; + data_reg = sender->mipi_hs_gen_data_reg; + + /* FIXME: wait_for_hs_fifos_empty(sender); */ + } else { + ctrl_reg = sender->mipi_lp_gen_ctrl_reg; + data_reg = sender->mipi_lp_gen_data_reg; + + /* FIXME: wait_for_lp_fifos_empty(sender); */ + } + + p = data; + for (i = 0; i < len / 4; i++) { + b1 = *p++; + b2 = *p++; + b3 = *p++; + b4 = *p++; + + REG_WRITE(data_reg, b4 << 24 | b3 << 16 | b2 << 8 | b1); + } + + i = len % 4; + if (i) { + b1 = 0; b2 = 0; b3 = 0; + + switch (i) { + case 3: + b1 = *p++; + b2 = *p++; + b3 = *p++; + break; + case 2: + b1 = *p++; + b2 = *p++; + break; + case 1: + b1 = *p++; + break; + } + + REG_WRITE(data_reg, b3 << 16 | b2 << 8 | b1); + } + + val = FLD_VAL(len, 23, 8) | FLD_VAL(virtual_channel, 7, 6) | + FLD_VAL(data_type, 5, 0); + + REG_WRITE(ctrl_reg, val); + + return 0; +} + +static int send_pkg_prepare(struct mdfld_dsi_pkg_sender *sender, u8 data_type, + u8 *data, u16 len) +{ + u8 cmd; + + switch (data_type) { + case DSI_DT_DCS_SHORT_WRITE_0: + case DSI_DT_DCS_SHORT_WRITE_1: + case DSI_DT_DCS_LONG_WRITE: + cmd = *data; + break; + default: + return 0; + } + + /*this prevents other package sending while doing msleep*/ + sender->status = MDFLD_DSI_PKG_SENDER_BUSY; + + /*wait for 120 milliseconds in case exit_sleep_mode just be sent*/ + if (unlikely(cmd == DCS_ENTER_SLEEP_MODE)) { + /*TODO: replace it with msleep later*/ + mdelay(120); + } + + if (unlikely(cmd == DCS_EXIT_SLEEP_MODE)) { + /*TODO: replace it with msleep later*/ + mdelay(120); + } + return 0; +} + +static int send_pkg_done(struct mdfld_dsi_pkg_sender 
*sender, u8 data_type, + u8 *data, u16 len) +{ + u8 cmd; + + switch (data_type) { + case DSI_DT_DCS_SHORT_WRITE_0: + case DSI_DT_DCS_SHORT_WRITE_1: + case DSI_DT_DCS_LONG_WRITE: + cmd = *data; + break; + default: + return 0; + } + + /*update panel status*/ + if (unlikely(cmd == DCS_ENTER_SLEEP_MODE)) { + sender->panel_mode |= MDFLD_DSI_PANEL_MODE_SLEEP; + /*TODO: replace it with msleep later*/ + mdelay(120); + } else if (unlikely(cmd == DCS_EXIT_SLEEP_MODE)) { + sender->panel_mode &= ~MDFLD_DSI_PANEL_MODE_SLEEP; + /*TODO: replace it with msleep later*/ + mdelay(120); + } else if (unlikely(cmd == DCS_SOFT_RESET)) { + /*TODO: replace it with msleep later*/ + mdelay(5); + } + + sender->status = MDFLD_DSI_PKG_SENDER_FREE; + + return 0; +} + +static int send_pkg(struct mdfld_dsi_pkg_sender *sender, u8 data_type, + u8 *data, u16 len, bool hs) +{ + int ret; + + /*handle DSI error*/ + ret = dsi_error_handler(sender); + if (ret) { + DRM_ERROR("Error handling failed\n"); + return -EAGAIN; + } + + /* send pkg */ + if (sender->status == MDFLD_DSI_PKG_SENDER_BUSY) { + DRM_ERROR("sender is busy\n"); + return -EAGAIN; + } + + ret = send_pkg_prepare(sender, data_type, data, len); + if (ret) { + DRM_ERROR("send_pkg_prepare error\n"); + return ret; + } + + switch (data_type) { + case DSI_DT_GENERIC_SHORT_WRITE_0: + case DSI_DT_GENERIC_SHORT_WRITE_1: + case DSI_DT_GENERIC_SHORT_WRITE_2: + case DSI_DT_GENERIC_READ_0: + case DSI_DT_GENERIC_READ_1: + case DSI_DT_GENERIC_READ_2: + case DSI_DT_DCS_SHORT_WRITE_0: + case DSI_DT_DCS_SHORT_WRITE_1: + case DSI_DT_DCS_READ: + ret = send_short_pkg(sender, data_type, data[0], data[1], hs); + break; + case DSI_DT_GENERIC_LONG_WRITE: + case DSI_DT_DCS_LONG_WRITE: + ret = send_long_pkg(sender, data_type, data, len, hs); + break; + } + + send_pkg_done(sender, data_type, data, len); + + /*FIXME: should I query complete and fifo empty here?*/ + + return ret; +} + +int mdfld_dsi_send_mcs_long(struct mdfld_dsi_pkg_sender *sender, u8 *data, + u32 len, 
bool hs) +{ + unsigned long flags; + + if (!sender || !data || !len) { + DRM_ERROR("Invalid parameters\n"); + return -EINVAL; + } + + spin_lock_irqsave(&sender->lock, flags); + send_pkg(sender, DSI_DT_DCS_LONG_WRITE, data, len, hs); + spin_unlock_irqrestore(&sender->lock, flags); + + return 0; +} + +int mdfld_dsi_send_mcs_short(struct mdfld_dsi_pkg_sender *sender, u8 cmd, + u8 param, u8 param_num, bool hs) +{ + u8 data[2]; + unsigned long flags; + u8 data_type; + + if (!sender) { + DRM_ERROR("Invalid parameter\n"); + return -EINVAL; + } + + data[0] = cmd; + + if (param_num) { + data_type = DSI_DT_DCS_SHORT_WRITE_1; + data[1] = param; + } else { + data_type = DSI_DT_DCS_SHORT_WRITE_0; + data[1] = 0; + } + + spin_lock_irqsave(&sender->lock, flags); + send_pkg(sender, data_type, data, sizeof(data), hs); + spin_unlock_irqrestore(&sender->lock, flags); + + return 0; +} + +int mdfld_dsi_send_gen_short(struct mdfld_dsi_pkg_sender *sender, u8 param0, + u8 param1, u8 param_num, bool hs) +{ + u8 data[2]; + unsigned long flags; + u8 data_type; + + if (!sender || param_num < 0 || param_num > 2) { + DRM_ERROR("Invalid parameter\n"); + return -EINVAL; + } + + switch (param_num) { + case 0: + data_type = DSI_DT_GENERIC_SHORT_WRITE_0; + data[0] = 0; + data[1] = 0; + break; + case 1: + data_type = DSI_DT_GENERIC_SHORT_WRITE_1; + data[0] = param0; + data[1] = 0; + break; + case 2: + data_type = DSI_DT_GENERIC_SHORT_WRITE_2; + data[0] = param0; + data[1] = param1; + break; + } + + spin_lock_irqsave(&sender->lock, flags); + send_pkg(sender, data_type, data, sizeof(data), hs); + spin_unlock_irqrestore(&sender->lock, flags); + + return 0; +} + +int mdfld_dsi_send_gen_long(struct mdfld_dsi_pkg_sender *sender, u8 *data, + u32 len, bool hs) +{ + unsigned long flags; + + if (!sender || !data || !len) { + DRM_ERROR("Invalid parameters\n"); + return -EINVAL; + } + + spin_lock_irqsave(&sender->lock, flags); + send_pkg(sender, DSI_DT_GENERIC_LONG_WRITE, data, len, hs); + 
spin_unlock_irqrestore(&sender->lock, flags); + + return 0; +} + +/* + * Send a read request and collect the reply, all under sender->lock. + * + * @data/@len describe the request handed to send_pkg(); @data_out receives + * @len_out 32-bit words read from the HS or LP generic data register, + * selected by @hs. + * + * Returns 0 on success, -EINVAL on bad arguments, -ETIMEDOUT if the read + * data available bit (29) is not raised within MDFLD_DSI_READ_MAX_COUNT + * polls of 100us. + */ +static int __read_panel_data(struct mdfld_dsi_pkg_sender *sender, u8 data_type, + u8 *data, u16 len, u32 *data_out, u16 len_out, bool hs) +{ + unsigned long flags; + struct drm_device *dev; + int i; + u32 gen_data_reg; + int retry = MDFLD_DSI_READ_MAX_COUNT; + + if (!sender || !data_out || !len_out) { + DRM_ERROR("Invalid parameters\n"); + return -EINVAL; + } + + /* + * Only dereference sender after the NULL check above. dev is not named + * again in this function; presumably the REG_READ/REG_WRITE macros use it. + */ + dev = sender->dev; + + /** + * do reading. + * 0) send out generic read request + * 1) polling read data avail interrupt + * 2) read data + */ + spin_lock_irqsave(&sender->lock, flags); + + REG_WRITE(sender->mipi_intr_stat_reg, BIT(29)); + + if ((REG_READ(sender->mipi_intr_stat_reg) & BIT(29))) + DRM_ERROR("Can NOT clean read data valid interrupt\n"); + + /*send out read request*/ + send_pkg(sender, data_type, data, len, hs); + + /*polling read data avail interrupt*/ + while (retry && !(REG_READ(sender->mipi_intr_stat_reg) & BIT(29))) { + udelay(100); + retry--; + } + + if (!retry) { + spin_unlock_irqrestore(&sender->lock, flags); + return -ETIMEDOUT; + } + + REG_WRITE(sender->mipi_intr_stat_reg, BIT(29)); + + /*read data*/ + if (hs) + gen_data_reg = sender->mipi_hs_gen_data_reg; + else + gen_data_reg = sender->mipi_lp_gen_data_reg; + + for (i = 0; i < len_out; i++) + *(data_out + i) = REG_READ(gen_data_reg); + + spin_unlock_irqrestore(&sender->lock, flags); + + return 0; +} + +/* + * Issue a DCS read for @cmd and store @len 32-bit words of the reply in @data. + * Returns 0 on success or a negative errno from __read_panel_data(). + */ +int mdfld_dsi_read_mcs(struct mdfld_dsi_pkg_sender *sender, u8 cmd, + u32 *data, u16 len, bool hs) +{ + /* + * send_pkg() passes data[0] and data[1] to send_short_pkg() for + * DSI_DT_DCS_READ, so hand it a two byte buffer; &cmd alone would be + * read one byte past its end. + */ + u8 req[2] = { cmd, 0 }; + + if (!sender || !data || !len) { + DRM_ERROR("Invalid parameters\n"); + return -EINVAL; + } + + return __read_panel_data(sender, DSI_DT_DCS_READ, req, sizeof(req), + data, len, hs); +} + +int mdfld_dsi_pkg_sender_init(struct mdfld_dsi_connector *dsi_connector, + int pipe) +{ + struct mdfld_dsi_pkg_sender *pkg_sender; + struct mdfld_dsi_config *dsi_config = + mdfld_dsi_get_config(dsi_connector); + /* mdfld_dsi_get_config() returns NULL for a NULL connector: do not dereference it ahead of the check below */ + struct drm_device *dev = dsi_config ? dsi_config->dev : NULL; + u32 mipi_val = 0; + + if (!dsi_connector) { + 
DRM_ERROR("Invalid parameter\n"); + return -EINVAL; + } + + pkg_sender = dsi_connector->pkg_sender; + + if (!pkg_sender || IS_ERR(pkg_sender)) { + pkg_sender = kzalloc(sizeof(struct mdfld_dsi_pkg_sender), + GFP_KERNEL); + if (!pkg_sender) { + DRM_ERROR("Create DSI pkg sender failed\n"); + return -ENOMEM; + } + dsi_connector->pkg_sender = (void *)pkg_sender; + } + + pkg_sender->dev = dev; + pkg_sender->dsi_connector = dsi_connector; + pkg_sender->pipe = pipe; + pkg_sender->pkg_num = 0; + pkg_sender->panel_mode = 0; + pkg_sender->status = MDFLD_DSI_PKG_SENDER_FREE; + + /*init regs*/ + if (pipe == 0) { + pkg_sender->dpll_reg = MRST_DPLL_A; + pkg_sender->dspcntr_reg = DSPACNTR; + pkg_sender->pipeconf_reg = PIPEACONF; + pkg_sender->dsplinoff_reg = DSPALINOFF; + pkg_sender->dspsurf_reg = DSPASURF; + pkg_sender->pipestat_reg = PIPEASTAT; + } else if (pipe == 2) { + pkg_sender->dpll_reg = MRST_DPLL_A; + pkg_sender->dspcntr_reg = DSPCCNTR; + pkg_sender->pipeconf_reg = PIPECCONF; + pkg_sender->dsplinoff_reg = DSPCLINOFF; + pkg_sender->dspsurf_reg = DSPCSURF; + pkg_sender->pipestat_reg = PIPECSTAT; + } + + pkg_sender->mipi_intr_stat_reg = MIPI_INTR_STAT_REG(pipe); + pkg_sender->mipi_lp_gen_data_reg = MIPI_LP_GEN_DATA_REG(pipe); + pkg_sender->mipi_hs_gen_data_reg = MIPI_HS_GEN_DATA_REG(pipe); + pkg_sender->mipi_lp_gen_ctrl_reg = MIPI_LP_GEN_CTRL_REG(pipe); + pkg_sender->mipi_hs_gen_ctrl_reg = MIPI_HS_GEN_CTRL_REG(pipe); + pkg_sender->mipi_gen_fifo_stat_reg = MIPI_GEN_FIFO_STAT_REG(pipe); + pkg_sender->mipi_data_addr_reg = MIPI_DATA_ADD_REG(pipe); + pkg_sender->mipi_data_len_reg = MIPI_DATA_LEN_REG(pipe); + pkg_sender->mipi_cmd_addr_reg = MIPI_CMD_ADD_REG(pipe); + pkg_sender->mipi_cmd_len_reg = MIPI_CMD_LEN_REG(pipe); + + /*init lock*/ + spin_lock_init(&pkg_sender->lock); + + if (mdfld_get_panel_type(dev, pipe) != TC35876X) { + /** + * For video mode, don't enable DPI timing output here, + * will init the DPI timing output during mode setting. 
+ */ + mipi_val = PASS_FROM_SPHY_TO_AFE | SEL_FLOPPED_HSTX; + + if (pipe == 0) + mipi_val |= 0x2; + + REG_WRITE(MIPI_PORT_CONTROL(pipe), mipi_val); + REG_READ(MIPI_PORT_CONTROL(pipe)); + + /* do dsi controller init */ + mdfld_dsi_controller_init(dsi_config, pipe); + } + + return 0; +} + +void mdfld_dsi_pkg_sender_destroy(struct mdfld_dsi_pkg_sender *sender) +{ + if (!sender || IS_ERR(sender)) + return; + + /*free*/ + kfree(sender); +} + + diff --git a/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h new file mode 100644 index 000000000000..459cd7ea8b81 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_dsi_pkg_sender.h @@ -0,0 +1,92 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Jackie Li + */ +#ifndef __MDFLD_DSI_PKG_SENDER_H__ +#define __MDFLD_DSI_PKG_SENDER_H__ + +#include + +#define MDFLD_MAX_DCS_PARAM 8 + +struct mdfld_dsi_pkg_sender { + struct drm_device *dev; + struct mdfld_dsi_connector *dsi_connector; + u32 status; + u32 panel_mode; + + int pipe; + + spinlock_t lock; + + u32 pkg_num; + + /* Registers */ + u32 dpll_reg; + u32 dspcntr_reg; + u32 pipeconf_reg; + u32 pipestat_reg; + u32 dsplinoff_reg; + u32 dspsurf_reg; + + u32 mipi_intr_stat_reg; + u32 mipi_lp_gen_data_reg; + u32 mipi_hs_gen_data_reg; + u32 mipi_lp_gen_ctrl_reg; + u32 mipi_hs_gen_ctrl_reg; + u32 mipi_gen_fifo_stat_reg; + u32 mipi_data_addr_reg; + u32 mipi_data_len_reg; + u32 mipi_cmd_addr_reg; + u32 mipi_cmd_len_reg; +}; + +/* DCS definitions */ +#define DCS_SOFT_RESET 0x01 +#define DCS_ENTER_SLEEP_MODE 0x10 +#define DCS_EXIT_SLEEP_MODE 0x11 +#define DCS_SET_DISPLAY_OFF 0x28 +#define DCS_SET_DISPLAY_ON 0x29 +#define DCS_SET_COLUMN_ADDRESS 0x2a +#define DCS_SET_PAGE_ADDRESS 0x2b +#define DCS_WRITE_MEM_START 0x2c +#define DCS_SET_TEAR_OFF 0x34 +#define DCS_SET_TEAR_ON 0x35 + +extern int mdfld_dsi_pkg_sender_init(struct mdfld_dsi_connector *dsi_connector, + int pipe); +extern void mdfld_dsi_pkg_sender_destroy(struct mdfld_dsi_pkg_sender *sender); +int mdfld_dsi_send_mcs_short(struct mdfld_dsi_pkg_sender *sender, u8 cmd, + u8 param, u8 param_num, bool hs); +int mdfld_dsi_send_mcs_long(struct mdfld_dsi_pkg_sender *sender, u8 *data, + u32 len, bool hs); +int mdfld_dsi_send_gen_short(struct mdfld_dsi_pkg_sender *sender, u8 param0, + u8 param1, u8 param_num, bool hs); +int mdfld_dsi_send_gen_long(struct mdfld_dsi_pkg_sender *sender, u8 *data, + u32 len, bool hs); +/* Read interfaces */ +int mdfld_dsi_read_mcs(struct mdfld_dsi_pkg_sender *sender, u8 cmd, + u32 *data, u16 len, bool hs); + +#endif diff --git a/drivers/gpu/drm/gma500/mdfld_intel_display.c b/drivers/gpu/drm/gma500/mdfld_intel_display.c new file mode 100644 index 000000000000..55e5af5cbd3d --- 
/dev/null +++ b/drivers/gpu/drm/gma500/mdfld_intel_display.c @@ -0,0 +1,1192 @@ +/* + * Copyright © 2006-2007 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Authors: + * Eric Anholt + */ + +#include +#include + +#include +#include "psb_intel_reg.h" +#include "psb_intel_display.h" +#include "framebuffer.h" +#include "mdfld_output.h" +#include "mdfld_dsi_output.h" + +/* Hardcoded currently */ +static int ksel = KSEL_CRYSTAL_19; + +struct psb_intel_range_t { + int min, max; +}; + +struct mrst_limit_t { + struct psb_intel_range_t dot, m, p1; +}; + +struct mrst_clock_t { + /* derived values */ + int dot; + int m; + int p1; +}; + +#define COUNT_MAX 0x10000000 + +void mdfldWaitForPipeDisable(struct drm_device *dev, int pipe) +{ + int count, temp; + u32 pipeconf_reg = PIPEACONF; + + switch (pipe) { + case 0: + break; + case 1: + pipeconf_reg = PIPEBCONF; + break; + case 2: + pipeconf_reg = PIPECCONF; + break; + default: + DRM_ERROR("Illegal Pipe Number.\n"); + return; + } + + /* FIXME JLIU7_PO */ + psb_intel_wait_for_vblank(dev); + return; + + /* Wait for for the pipe disable to take effect. 
*/ + for (count = 0; count < COUNT_MAX; count++) { + temp = REG_READ(pipeconf_reg); + if ((temp & PIPEACONF_PIPE_STATE) == 0) + break; + } +} + +void mdfldWaitForPipeEnable(struct drm_device *dev, int pipe) +{ + int count, temp; + u32 pipeconf_reg = PIPEACONF; + + switch (pipe) { + case 0: + break; + case 1: + pipeconf_reg = PIPEBCONF; + break; + case 2: + pipeconf_reg = PIPECCONF; + break; + default: + DRM_ERROR("Illegal Pipe Number.\n"); + return; + } + + /* FIXME JLIU7_PO */ + psb_intel_wait_for_vblank(dev); + return; + + /* Wait for for the pipe enable to take effect. */ + for (count = 0; count < COUNT_MAX; count++) { + temp = REG_READ(pipeconf_reg); + if ((temp & PIPEACONF_PIPE_STATE) == 1) + break; + } +} + +static void psb_intel_crtc_prepare(struct drm_crtc *crtc) +{ + struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; + crtc_funcs->dpms(crtc, DRM_MODE_DPMS_OFF); +} + +static void psb_intel_crtc_commit(struct drm_crtc *crtc) +{ + struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; + crtc_funcs->dpms(crtc, DRM_MODE_DPMS_ON); +} + +static bool psb_intel_crtc_mode_fixup(struct drm_crtc *crtc, + struct drm_display_mode *mode, + struct drm_display_mode *adjusted_mode) +{ + return true; +} + +/** + * Return the pipe currently connected to the panel fitter, + * or -1 if the panel fitter is not present or not in use + */ +static int psb_intel_panel_fitter_pipe(struct drm_device *dev) +{ + u32 pfit_control; + + pfit_control = REG_READ(PFIT_CONTROL); + + /* See if the panel fitter is in use */ + if ((pfit_control & PFIT_ENABLE) == 0) + return -1; + + /* 965 can place panel fitter on either pipe */ + return (pfit_control >> 29) & 0x3; +} + +static struct drm_device globle_dev; + +void mdfld__intel_plane_set_alpha(int enable) +{ + struct drm_device *dev = &globle_dev; + int dspcntr_reg = DSPACNTR; + u32 dspcntr; + + dspcntr = REG_READ(dspcntr_reg); + + if (enable) { + dspcntr &= ~DISPPLANE_32BPP_NO_ALPHA; + dspcntr |= DISPPLANE_32BPP; + } else 
{ + dspcntr &= ~DISPPLANE_32BPP; + dspcntr |= DISPPLANE_32BPP_NO_ALPHA; + } + + REG_WRITE(dspcntr_reg, dspcntr); +} + +static int check_fb(struct drm_framebuffer *fb) +{ + if (!fb) + return 0; + + switch (fb->bits_per_pixel) { + case 8: + case 16: + case 24: + case 32: + return 0; + default: + DRM_ERROR("Unknown color depth\n"); + return -EINVAL; + } +} + +static int mdfld__intel_pipe_set_base(struct drm_crtc *crtc, int x, int y, + struct drm_framebuffer *old_fb) +{ + struct drm_device *dev = crtc->dev; + /* struct drm_i915_master_private *master_priv; */ + struct psb_intel_crtc *psb_intel_crtc = to_psb_intel_crtc(crtc); + struct psb_framebuffer *psbfb = to_psb_fb(crtc->fb); + int pipe = psb_intel_crtc->pipe; + unsigned long start, offset; + int dsplinoff = DSPALINOFF; + int dspsurf = DSPASURF; + int dspstride = DSPASTRIDE; + int dspcntr_reg = DSPACNTR; + u32 dspcntr; + int ret; + + memcpy(&globle_dev, dev, sizeof(struct drm_device)); + + dev_dbg(dev->dev, "pipe = 0x%x.\n", pipe); + + /* no fb bound */ + if (!crtc->fb) { + dev_dbg(dev->dev, "No FB bound\n"); + return 0; + } + + ret = check_fb(crtc->fb); + if (ret) + return ret; + + switch (pipe) { + case 0: + dsplinoff = DSPALINOFF; + break; + case 1: + dsplinoff = DSPBLINOFF; + dspsurf = DSPBSURF; + dspstride = DSPBSTRIDE; + dspcntr_reg = DSPBCNTR; + break; + case 2: + dsplinoff = DSPCLINOFF; + dspsurf = DSPCSURF; + dspstride = DSPCSTRIDE; + dspcntr_reg = DSPCCNTR; + break; + default: + DRM_ERROR("Illegal Pipe Number.\n"); + return -EINVAL; + } + + if (!gma_power_begin(dev, true)) + return 0; + + start = psbfb->gtt->offset; + offset = y * crtc->fb->pitches[0] + x * (crtc->fb->bits_per_pixel / 8); + + REG_WRITE(dspstride, crtc->fb->pitches[0]); + dspcntr = REG_READ(dspcntr_reg); + dspcntr &= ~DISPPLANE_PIXFORMAT_MASK; + + switch (crtc->fb->bits_per_pixel) { + case 8: + dspcntr |= DISPPLANE_8BPP; + break; + case 16: + if (crtc->fb->depth == 15) + dspcntr |= DISPPLANE_15_16BPP; + else + dspcntr |= DISPPLANE_16BPP; + 
break; + case 24: + case 32: + dspcntr |= DISPPLANE_32BPP_NO_ALPHA; + break; + } + REG_WRITE(dspcntr_reg, dspcntr); + + dev_dbg(dev->dev, "Writing base %08lX %08lX %d %d\n", + start, offset, x, y); + REG_WRITE(dsplinoff, offset); + REG_READ(dsplinoff); + REG_WRITE(dspsurf, start); + REG_READ(dspsurf); + + gma_power_end(dev); + + return 0; +} + +/* + * Disable the pipe, plane and pll. + * + */ +void mdfld_disable_crtc(struct drm_device *dev, int pipe) +{ + int dpll_reg = MRST_DPLL_A; + int dspcntr_reg = DSPACNTR; + int dspbase_reg = MRST_DSPABASE; + int pipeconf_reg = PIPEACONF; + u32 temp; + + dev_dbg(dev->dev, "pipe = %d\n", pipe); + + + switch (pipe) { + case 0: + break; + case 1: + dpll_reg = MDFLD_DPLL_B; + dspcntr_reg = DSPBCNTR; + dspbase_reg = DSPBSURF; + pipeconf_reg = PIPEBCONF; + break; + case 2: + dpll_reg = MRST_DPLL_A; + dspcntr_reg = DSPCCNTR; + dspbase_reg = MDFLD_DSPCBASE; + pipeconf_reg = PIPECCONF; + break; + default: + DRM_ERROR("Illegal Pipe Number.\n"); + return; + } + + if (pipe != 1) + mdfld_dsi_gen_fifo_ready(dev, MIPI_GEN_FIFO_STAT_REG(pipe), + HS_CTRL_FIFO_EMPTY | HS_DATA_FIFO_EMPTY); + + /* Disable display plane */ + temp = REG_READ(dspcntr_reg); + if ((temp & DISPLAY_PLANE_ENABLE) != 0) { + REG_WRITE(dspcntr_reg, + temp & ~DISPLAY_PLANE_ENABLE); + /* Flush the plane changes */ + REG_WRITE(dspbase_reg, REG_READ(dspbase_reg)); + REG_READ(dspbase_reg); + } + + /* FIXME_JLIU7 MDFLD_PO revisit */ + + /* Next, disable display pipes */ + temp = REG_READ(pipeconf_reg); + if ((temp & PIPEACONF_ENABLE) != 0) { + temp &= ~PIPEACONF_ENABLE; + temp |= PIPECONF_PLANE_OFF | PIPECONF_CURSOR_OFF; + REG_WRITE(pipeconf_reg, temp); + REG_READ(pipeconf_reg); + + /* Wait for for the pipe disable to take effect. 
*/ + mdfldWaitForPipeDisable(dev, pipe); + } + + temp = REG_READ(dpll_reg); + if (temp & DPLL_VCO_ENABLE) { + if ((pipe != 1 && + !((REG_READ(PIPEACONF) | REG_READ(PIPECCONF)) + & PIPEACONF_ENABLE)) || pipe == 1) { + temp &= ~(DPLL_VCO_ENABLE); + REG_WRITE(dpll_reg, temp); + REG_READ(dpll_reg); + /* Wait for the clocks to turn off. */ + /* FIXME_MDFLD PO may need more delay */ + udelay(500); + + if (!(temp & MDFLD_PWR_GATE_EN)) { + /* gating power of DPLL */ + REG_WRITE(dpll_reg, temp | MDFLD_PWR_GATE_EN); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(5000); + } + } + } + +} + +/** + * Sets the power management mode of the pipe and plane. + * + * This code should probably grow support for turning the cursor off and back + * on appropriately at the same time as we're turning the pipe off/on. + */ +static void mdfld_crtc_dpms(struct drm_crtc *crtc, int mode) +{ + struct drm_device *dev = crtc->dev; + struct drm_psb_private *dev_priv = dev->dev_private; + struct psb_intel_crtc *psb_intel_crtc = to_psb_intel_crtc(crtc); + int pipe = psb_intel_crtc->pipe; + int dpll_reg = MRST_DPLL_A; + int dspcntr_reg = DSPACNTR; + int dspbase_reg = MRST_DSPABASE; + int pipeconf_reg = PIPEACONF; + u32 pipestat_reg = PIPEASTAT; + u32 pipeconf = dev_priv->pipeconf[pipe]; + u32 temp; + bool enabled; + int timeout = 0; + + dev_dbg(dev->dev, "mode = %d, pipe = %d\n", mode, pipe); + +/* FIXME_JLIU7 MDFLD_PO replaced w/ the following function */ +/* mdfld_dbi_dpms (struct drm_device *dev, int pipe, bool enabled) */ + + switch (pipe) { + case 0: + break; + case 1: + dpll_reg = DPLL_B; + dspcntr_reg = DSPBCNTR; + dspbase_reg = MRST_DSPBBASE; + pipeconf_reg = PIPEBCONF; + dpll_reg = MDFLD_DPLL_B; + break; + case 2: + dpll_reg = MRST_DPLL_A; + dspcntr_reg = DSPCCNTR; + dspbase_reg = MDFLD_DSPCBASE; + pipeconf_reg = PIPECCONF; + pipestat_reg = PIPECSTAT; + break; + default: + DRM_ERROR("Illegal Pipe Number.\n"); + return; + } + + if (!gma_power_begin(dev, true)) + return; + + /* XXX: 
When our outputs are all unaware of DPMS modes other than off + * and on, we should map those modes to DRM_MODE_DPMS_OFF in the CRTC. + */ + switch (mode) { + case DRM_MODE_DPMS_ON: + case DRM_MODE_DPMS_STANDBY: + case DRM_MODE_DPMS_SUSPEND: + /* Enable the DPLL */ + temp = REG_READ(dpll_reg); + + if ((temp & DPLL_VCO_ENABLE) == 0) { + /* When ungating power of DPLL, needs to wait 0.5us + before enable the VCO */ + if (temp & MDFLD_PWR_GATE_EN) { + temp &= ~MDFLD_PWR_GATE_EN; + REG_WRITE(dpll_reg, temp); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + } + + REG_WRITE(dpll_reg, temp); + REG_READ(dpll_reg); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + + REG_WRITE(dpll_reg, temp | DPLL_VCO_ENABLE); + REG_READ(dpll_reg); + + /** + * wait for DSI PLL to lock + * NOTE: only need to poll status of pipe 0 and pipe 1, + * since both MIPI pipes share the same PLL. + */ + while ((pipe != 2) && (timeout < 20000) && + !(REG_READ(pipeconf_reg) & PIPECONF_DSIPLL_LOCK)) { + udelay(150); + timeout++; + } + } + + /* Enable the plane */ + temp = REG_READ(dspcntr_reg); + if ((temp & DISPLAY_PLANE_ENABLE) == 0) { + REG_WRITE(dspcntr_reg, + temp | DISPLAY_PLANE_ENABLE); + /* Flush the plane changes */ + REG_WRITE(dspbase_reg, REG_READ(dspbase_reg)); + } + + /* Enable the pipe */ + temp = REG_READ(pipeconf_reg); + if ((temp & PIPEACONF_ENABLE) == 0) { + REG_WRITE(pipeconf_reg, pipeconf); + + /* Wait for for the pipe enable to take effect. 
*/ + mdfldWaitForPipeEnable(dev, pipe); + } + + /*workaround for sighting 3741701 Random X blank display*/ + /*perform w/a in video mode only on pipe A or C*/ + if (pipe == 0 || pipe == 2) { + REG_WRITE(pipestat_reg, REG_READ(pipestat_reg)); + msleep(100); + if (PIPE_VBLANK_STATUS & REG_READ(pipestat_reg)) + dev_dbg(dev->dev, "OK"); + else { + dev_dbg(dev->dev, "STUCK!!!!"); + /*shutdown controller*/ + temp = REG_READ(dspcntr_reg); + REG_WRITE(dspcntr_reg, + temp & ~DISPLAY_PLANE_ENABLE); + REG_WRITE(dspbase_reg, REG_READ(dspbase_reg)); + /*mdfld_dsi_dpi_shut_down(dev, pipe);*/ + REG_WRITE(0xb048, 1); + msleep(100); + temp = REG_READ(pipeconf_reg); + temp &= ~PIPEACONF_ENABLE; + REG_WRITE(pipeconf_reg, temp); + msleep(100); /*wait for pipe disable*/ + REG_WRITE(MIPI_DEVICE_READY_REG(pipe), 0); + msleep(100); + REG_WRITE(0xb004, REG_READ(0xb004)); + /* try to bring the controller back up again*/ + REG_WRITE(MIPI_DEVICE_READY_REG(pipe), 1); + temp = REG_READ(dspcntr_reg); + REG_WRITE(dspcntr_reg, + temp | DISPLAY_PLANE_ENABLE); + REG_WRITE(dspbase_reg, REG_READ(dspbase_reg)); + /*mdfld_dsi_dpi_turn_on(dev, pipe);*/ + REG_WRITE(0xb048, 2); + msleep(100); + temp = REG_READ(pipeconf_reg); + temp |= PIPEACONF_ENABLE; + REG_WRITE(pipeconf_reg, temp); + } + } + + psb_intel_crtc_load_lut(crtc); + + /* Give the overlay scaler a chance to enable + if it's on this pipe */ + /* psb_intel_crtc_dpms_video(crtc, true); TODO */ + + break; + case DRM_MODE_DPMS_OFF: + /* Give the overlay scaler a chance to disable + * if it's on this pipe */ + /* psb_intel_crtc_dpms_video(crtc, FALSE); TODO */ + if (pipe != 1) + mdfld_dsi_gen_fifo_ready(dev, + MIPI_GEN_FIFO_STAT_REG(pipe), + HS_CTRL_FIFO_EMPTY | HS_DATA_FIFO_EMPTY); + + /* Disable the VGA plane that we never use */ + REG_WRITE(VGACNTRL, VGA_DISP_DISABLE); + + /* Disable display plane */ + temp = REG_READ(dspcntr_reg); + if ((temp & DISPLAY_PLANE_ENABLE) != 0) { + REG_WRITE(dspcntr_reg, + temp & ~DISPLAY_PLANE_ENABLE); + /* Flush the 
plane changes */ + REG_WRITE(dspbase_reg, REG_READ(dspbase_reg)); + REG_READ(dspbase_reg); + } + + /* Next, disable display pipes */ + temp = REG_READ(pipeconf_reg); + if ((temp & PIPEACONF_ENABLE) != 0) { + temp &= ~PIPEACONF_ENABLE; + temp |= PIPECONF_PLANE_OFF | PIPECONF_CURSOR_OFF; + REG_WRITE(pipeconf_reg, temp); + REG_READ(pipeconf_reg); + + /* Wait for for the pipe disable to take effect. */ + mdfldWaitForPipeDisable(dev, pipe); + } + + temp = REG_READ(dpll_reg); + if (temp & DPLL_VCO_ENABLE) { + if ((pipe != 1 && !((REG_READ(PIPEACONF) + | REG_READ(PIPECCONF)) & PIPEACONF_ENABLE)) + || pipe == 1) { + temp &= ~(DPLL_VCO_ENABLE); + REG_WRITE(dpll_reg, temp); + REG_READ(dpll_reg); + /* Wait for the clocks to turn off. */ + /* FIXME_MDFLD PO may need more delay */ + udelay(500); + } + } + break; + } + enabled = crtc->enabled && mode != DRM_MODE_DPMS_OFF; + gma_power_end(dev); +} + + +#define MDFLD_LIMT_DPLL_19 0 +#define MDFLD_LIMT_DPLL_25 1 +#define MDFLD_LIMT_DPLL_83 2 +#define MDFLD_LIMT_DPLL_100 3 +#define MDFLD_LIMT_DSIPLL_19 4 +#define MDFLD_LIMT_DSIPLL_25 5 +#define MDFLD_LIMT_DSIPLL_83 6 +#define MDFLD_LIMT_DSIPLL_100 7 + +#define MDFLD_DOT_MIN 19750 +#define MDFLD_DOT_MAX 120000 +#define MDFLD_DPLL_M_MIN_19 113 +#define MDFLD_DPLL_M_MAX_19 155 +#define MDFLD_DPLL_P1_MIN_19 2 +#define MDFLD_DPLL_P1_MAX_19 10 +#define MDFLD_DPLL_M_MIN_25 101 +#define MDFLD_DPLL_M_MAX_25 130 +#define MDFLD_DPLL_P1_MIN_25 2 +#define MDFLD_DPLL_P1_MAX_25 10 +#define MDFLD_DPLL_M_MIN_83 64 +#define MDFLD_DPLL_M_MAX_83 64 +#define MDFLD_DPLL_P1_MIN_83 2 +#define MDFLD_DPLL_P1_MAX_83 2 +#define MDFLD_DPLL_M_MIN_100 64 +#define MDFLD_DPLL_M_MAX_100 64 +#define MDFLD_DPLL_P1_MIN_100 2 +#define MDFLD_DPLL_P1_MAX_100 2 +#define MDFLD_DSIPLL_M_MIN_19 131 +#define MDFLD_DSIPLL_M_MAX_19 175 +#define MDFLD_DSIPLL_P1_MIN_19 3 +#define MDFLD_DSIPLL_P1_MAX_19 8 +#define MDFLD_DSIPLL_M_MIN_25 97 +#define MDFLD_DSIPLL_M_MAX_25 140 +#define MDFLD_DSIPLL_P1_MIN_25 3 +#define 
MDFLD_DSIPLL_P1_MAX_25 9 +#define MDFLD_DSIPLL_M_MIN_83 33 +#define MDFLD_DSIPLL_M_MAX_83 92 +#define MDFLD_DSIPLL_P1_MIN_83 2 +#define MDFLD_DSIPLL_P1_MAX_83 3 +#define MDFLD_DSIPLL_M_MIN_100 97 +#define MDFLD_DSIPLL_M_MAX_100 140 +#define MDFLD_DSIPLL_P1_MIN_100 3 +#define MDFLD_DSIPLL_P1_MAX_100 9 + +static const struct mrst_limit_t mdfld_limits[] = { + { /* MDFLD_LIMT_DPLL_19 */ + .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DPLL_M_MIN_19, .max = MDFLD_DPLL_M_MAX_19}, + .p1 = {.min = MDFLD_DPLL_P1_MIN_19, .max = MDFLD_DPLL_P1_MAX_19}, + }, + { /* MDFLD_LIMT_DPLL_25 */ + .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DPLL_M_MIN_25, .max = MDFLD_DPLL_M_MAX_25}, + .p1 = {.min = MDFLD_DPLL_P1_MIN_25, .max = MDFLD_DPLL_P1_MAX_25}, + }, + { /* MDFLD_LIMT_DPLL_83 */ + .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DPLL_M_MIN_83, .max = MDFLD_DPLL_M_MAX_83}, + .p1 = {.min = MDFLD_DPLL_P1_MIN_83, .max = MDFLD_DPLL_P1_MAX_83}, + }, + { /* MDFLD_LIMT_DPLL_100 */ + .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DPLL_M_MIN_100, .max = MDFLD_DPLL_M_MAX_100}, + .p1 = {.min = MDFLD_DPLL_P1_MIN_100, .max = MDFLD_DPLL_P1_MAX_100}, + }, + { /* MDFLD_LIMT_DSIPLL_19 */ + .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DSIPLL_M_MIN_19, .max = MDFLD_DSIPLL_M_MAX_19}, + .p1 = {.min = MDFLD_DSIPLL_P1_MIN_19, .max = MDFLD_DSIPLL_P1_MAX_19}, + }, + { /* MDFLD_LIMT_DSIPLL_25 */ + .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DSIPLL_M_MIN_25, .max = MDFLD_DSIPLL_M_MAX_25}, + .p1 = {.min = MDFLD_DSIPLL_P1_MIN_25, .max = MDFLD_DSIPLL_P1_MAX_25}, + }, + { /* MDFLD_LIMT_DSIPLL_83 */ + .dot = {.min = MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DSIPLL_M_MIN_83, .max = MDFLD_DSIPLL_M_MAX_83}, + .p1 = {.min = MDFLD_DSIPLL_P1_MIN_83, .max = MDFLD_DSIPLL_P1_MAX_83}, + }, + { /* MDFLD_LIMT_DSIPLL_100 */ + .dot = {.min = 
MDFLD_DOT_MIN, .max = MDFLD_DOT_MAX}, + .m = {.min = MDFLD_DSIPLL_M_MIN_100, .max = MDFLD_DSIPLL_M_MAX_100}, + .p1 = {.min = MDFLD_DSIPLL_P1_MIN_100, .max = MDFLD_DSIPLL_P1_MAX_100}, + }, +}; + +#define MDFLD_M_MIN 21 +#define MDFLD_M_MAX 180 +/* + * One entry per M value from MDFLD_M_MIN to MDFLD_M_MAX; the row comments + * give the M range each row covers, so entry i belongs to M = MDFLD_M_MIN + i. + */ +static const u32 mdfld_m_converts[] = { +/* M configuration table from 9-bit LFSR table */ + 224, 368, 440, 220, 366, 439, 219, 365, 182, 347, /* 21 - 30 */ + 173, 342, 171, 85, 298, 149, 74, 37, 18, 265, /* 31 - 40 */ + 388, 194, 353, 432, 216, 108, 310, 155, 333, 166, /* 41 - 50 */ + 83, 41, 276, 138, 325, 162, 337, 168, 340, 170, /* 51 - 60 */ + 341, 426, 469, 234, 373, 442, 221, 110, 311, 411, /* 61 - 70 */ + 461, 486, 243, 377, 188, 350, 175, 343, 427, 213, /* 71 - 80 */ + 106, 53, 282, 397, 354, 227, 113, 56, 284, 142, /* 81 - 90 */ + 71, 35, 273, 136, 324, 418, 465, 488, 500, 506, /* 91 - 100 */ + 253, 126, 63, 287, 399, 455, 483, 241, 376, 444, /* 101 - 110 */ + 478, 495, 503, 251, 381, 446, 479, 239, 375, 443, /* 111 - 120 */ + 477, 238, 119, 315, 157, 78, 295, 147, 329, 420, /* 121 - 130 */ + 210, 105, 308, 154, 77, 38, 275, 137, 68, 290, /* 131 - 140 */ + 145, 328, 164, 82, 297, 404, 458, 485, 498, 249, /* 141 - 150 */ + 380, 190, 351, 431, 471, 235, 117, 314, 413, 206, /* 151 - 160 */ + 103, 51, 25, 12, 262, 387, 193, 96, 48, 280, /* 161 - 170 */ + 396, 198, 99, 305, 152, 76, 294, 403, 457, 228, /* 171 - 180 */ +}; + +/* + * Pick the mdfld_limits[] entry for this CRTC: the DSIPLL rows for MIPI/MIPI2 + * outputs, the DPLL rows for HDMI, selected further by ksel and core_freq. + * The result stays NULL when no combination matches. + */ +static const struct mrst_limit_t *mdfld_limit(struct drm_crtc *crtc) +{ + const struct mrst_limit_t *limit = NULL; + struct drm_device *dev = crtc->dev; + struct drm_psb_private *dev_priv = dev->dev_private; + + if (psb_intel_pipe_has_type(crtc, INTEL_OUTPUT_MIPI) + || psb_intel_pipe_has_type(crtc, INTEL_OUTPUT_MIPI2)) { + if ((ksel == KSEL_CRYSTAL_19) || (ksel == KSEL_BYPASS_19)) + limit = &mdfld_limits[MDFLD_LIMT_DSIPLL_19]; + else if (ksel == KSEL_BYPASS_25) + limit = &mdfld_limits[MDFLD_LIMT_DSIPLL_25]; + else if ((ksel == KSEL_BYPASS_83_100) && + (dev_priv->core_freq == 166)) + limit = 
&mdfld_limits[MDFLD_LIMT_DSIPLL_83]; + else if ((ksel == KSEL_BYPASS_83_100) && + (dev_priv->core_freq == 100 || + dev_priv->core_freq == 200)) + limit = &mdfld_limits[MDFLD_LIMT_DSIPLL_100]; + } else if (psb_intel_pipe_has_type(crtc, INTEL_OUTPUT_HDMI)) { + if ((ksel == KSEL_CRYSTAL_19) || (ksel == KSEL_BYPASS_19)) + limit = &mdfld_limits[MDFLD_LIMT_DPLL_19]; + else if (ksel == KSEL_BYPASS_25) + limit = &mdfld_limits[MDFLD_LIMT_DPLL_25]; + else if ((ksel == KSEL_BYPASS_83_100) && + (dev_priv->core_freq == 166)) + limit = &mdfld_limits[MDFLD_LIMT_DPLL_83]; + else if ((ksel == KSEL_BYPASS_83_100) && + (dev_priv->core_freq == 100 || + dev_priv->core_freq == 200)) + limit = &mdfld_limits[MDFLD_LIMT_DPLL_100]; + } else { + limit = NULL; + dev_dbg(dev->dev, "mdfld_limit Wrong display type.\n"); + } + + return limit; +} + +/** + * Compute the dot clock from the reference clock and the divisors already + * stored in @clock: dot = refclk * m / p1 (integer arithmetic). The result + * is written to clock->dot. + */ +static void mdfld_clock(int refclk, struct mrst_clock_t *clock) +{ + clock->dot = (refclk * clock->m) / clock->p1; +} + +/** + * Returns a set of divisors for the desired target clock with the given refclk, + * or FALSE.
Divisor values are the actual divisors for + */ +/* + * Search every (m, p1) pair permitted by mdfld_limit() for the pair whose dot + * clock, as computed by mdfld_clock(), is closest to @target. On success + * *best_clock holds that pair and true is returned. Returns false, with + * *best_clock zeroed, when no limit entry applies to this CRTC or when no + * pair comes closer to @target than @target itself. + */ +static bool +mdfldFindBestPLL(struct drm_crtc *crtc, int target, int refclk, + struct mrst_clock_t *best_clock) +{ + struct mrst_clock_t clock; + const struct mrst_limit_t *limit = mdfld_limit(crtc); + int err = target; + + memset(best_clock, 0, sizeof(*best_clock)); + + /* + * mdfld_limit() yields NULL for an unsupported output type or an + * unmatched ksel/core_freq pairing; report failure instead of + * dereferencing it. best_clock has already been zeroed above. + */ + if (!limit) + return false; + + for (clock.m = limit->m.min; clock.m <= limit->m.max; clock.m++) { + for (clock.p1 = limit->p1.min; clock.p1 <= limit->p1.max; + clock.p1++) { + int this_err; + + mdfld_clock(refclk, &clock); + + this_err = abs(clock.dot - target); + if (this_err < err) { + *best_clock = clock; + err = this_err; + } + } + } + /* err only moves away from its initial value when a candidate was accepted */ + return err != target; +} + +static int mdfld_crtc_mode_set(struct drm_crtc *crtc, + struct drm_display_mode *mode, + struct drm_display_mode *adjusted_mode, + int x, int y, + struct drm_framebuffer *old_fb) +{ + struct drm_device *dev = crtc->dev; + struct psb_intel_crtc *psb_intel_crtc = to_psb_intel_crtc(crtc); + struct drm_psb_private *dev_priv = dev->dev_private; + int pipe = psb_intel_crtc->pipe; + int fp_reg = MRST_FPA0; + int dpll_reg = MRST_DPLL_A; + int dspcntr_reg = DSPACNTR; + int pipeconf_reg = PIPEACONF; + int htot_reg = HTOTAL_A; + int hblank_reg = HBLANK_A; + int hsync_reg = HSYNC_A; + int vtot_reg = VTOTAL_A; + int vblank_reg = VBLANK_A; + int vsync_reg = VSYNC_A; + int dspsize_reg = DSPASIZE; + int dsppos_reg = DSPAPOS; + int pipesrc_reg = PIPEASRC; + u32 *pipeconf = &dev_priv->pipeconf[pipe]; + u32 *dspcntr = &dev_priv->dspcntr[pipe]; + int refclk = 0; + int clk_n = 0, clk_p2 = 0, clk_byte = 1, clk = 0, m_conv = 0, + clk_tmp = 0; + struct mrst_clock_t clock; + bool ok; + u32 dpll = 0, fp = 0; + bool is_crt = false, is_lvds = false, is_tv = false; + bool is_mipi = false, is_mipi2 = false, is_hdmi = false; + struct drm_mode_config *mode_config = &dev->mode_config; + struct psb_intel_encoder *psb_intel_encoder = NULL; + uint64_t scalingType = DRM_MODE_SCALE_FULLSCREEN; + struct drm_encoder *encoder; + struct drm_connector 
*connector; + int timeout = 0; + int ret; + + dev_dbg(dev->dev, "pipe = 0x%x\n", pipe); + +#if 0 + if (pipe == 1) { + if (!gma_power_begin(dev, true)) + return 0; + android_hdmi_crtc_mode_set(crtc, mode, adjusted_mode, + x, y, old_fb); + goto mrst_crtc_mode_set_exit; + } +#endif + + switch (pipe) { + case 0: + break; + case 1: + fp_reg = FPB0; + dpll_reg = DPLL_B; + dspcntr_reg = DSPBCNTR; + pipeconf_reg = PIPEBCONF; + htot_reg = HTOTAL_B; + hblank_reg = HBLANK_B; + hsync_reg = HSYNC_B; + vtot_reg = VTOTAL_B; + vblank_reg = VBLANK_B; + vsync_reg = VSYNC_B; + dspsize_reg = DSPBSIZE; + dsppos_reg = DSPBPOS; + pipesrc_reg = PIPEBSRC; + fp_reg = MDFLD_DPLL_DIV0; + dpll_reg = MDFLD_DPLL_B; + break; + case 2: + dpll_reg = MRST_DPLL_A; + dspcntr_reg = DSPCCNTR; + pipeconf_reg = PIPECCONF; + htot_reg = HTOTAL_C; + hblank_reg = HBLANK_C; + hsync_reg = HSYNC_C; + vtot_reg = VTOTAL_C; + vblank_reg = VBLANK_C; + vsync_reg = VSYNC_C; + dspsize_reg = DSPCSIZE; + dsppos_reg = DSPCPOS; + pipesrc_reg = PIPECSRC; + break; + default: + DRM_ERROR("Illegal Pipe Number.\n"); + return 0; + } + + ret = check_fb(crtc->fb); + if (ret) + return ret; + + dev_dbg(dev->dev, "adjusted_hdisplay = %d\n", + adjusted_mode->hdisplay); + dev_dbg(dev->dev, "adjusted_vdisplay = %d\n", + adjusted_mode->vdisplay); + dev_dbg(dev->dev, "adjusted_hsync_start = %d\n", + adjusted_mode->hsync_start); + dev_dbg(dev->dev, "adjusted_hsync_end = %d\n", + adjusted_mode->hsync_end); + dev_dbg(dev->dev, "adjusted_htotal = %d\n", + adjusted_mode->htotal); + dev_dbg(dev->dev, "adjusted_vsync_start = %d\n", + adjusted_mode->vsync_start); + dev_dbg(dev->dev, "adjusted_vsync_end = %d\n", + adjusted_mode->vsync_end); + dev_dbg(dev->dev, "adjusted_vtotal = %d\n", + adjusted_mode->vtotal); + dev_dbg(dev->dev, "adjusted_clock = %d\n", + adjusted_mode->clock); + dev_dbg(dev->dev, "hdisplay = %d\n", + mode->hdisplay); + dev_dbg(dev->dev, "vdisplay = %d\n", + mode->vdisplay); + + if (!gma_power_begin(dev, true)) + return 0; + + 
memcpy(&psb_intel_crtc->saved_mode, mode, + sizeof(struct drm_display_mode)); + memcpy(&psb_intel_crtc->saved_adjusted_mode, adjusted_mode, + sizeof(struct drm_display_mode)); + + list_for_each_entry(connector, &mode_config->connector_list, head) { + if (!connector) + continue; + + encoder = connector->encoder; + + if (!encoder) + continue; + + if (encoder->crtc != crtc) + continue; + + psb_intel_encoder = psb_intel_attached_encoder(connector); + + switch (psb_intel_encoder->type) { + case INTEL_OUTPUT_LVDS: + is_lvds = true; + break; + case INTEL_OUTPUT_TVOUT: + is_tv = true; + break; + case INTEL_OUTPUT_ANALOG: + is_crt = true; + break; + case INTEL_OUTPUT_MIPI: + is_mipi = true; + break; + case INTEL_OUTPUT_MIPI2: + is_mipi2 = true; + break; + case INTEL_OUTPUT_HDMI: + is_hdmi = true; + break; + } + } + + /* Disable the VGA plane that we never use */ + REG_WRITE(VGACNTRL, VGA_DISP_DISABLE); + + /* Disable the panel fitter if it was on our pipe */ + if (psb_intel_panel_fitter_pipe(dev) == pipe) + REG_WRITE(PFIT_CONTROL, 0); + + /* pipesrc and dspsize control the size that is scaled from, + * which should always be the user's requested size. + */ + if (pipe == 1) { + /* FIXME: To make HDMI display with 864x480 (TPO), 480x864 + * (PYR) or 480x854 (TMD), set the sprite width/height and + * souce image size registers with the adjusted mode for + * pipe B. + */ + + /* + * The defined sprite rectangle must always be completely + * contained within the displayable area of the screen image + * (frame buffer). + */ + REG_WRITE(dspsize_reg, ((min(mode->crtc_vdisplay, adjusted_mode->crtc_vdisplay) - 1) << 16) + | (min(mode->crtc_hdisplay, adjusted_mode->crtc_hdisplay) - 1)); + /* Set the CRTC with encoder mode. 
*/ + REG_WRITE(pipesrc_reg, ((mode->crtc_hdisplay - 1) << 16) + | (mode->crtc_vdisplay - 1)); + } else { + REG_WRITE(dspsize_reg, + ((mode->crtc_vdisplay - 1) << 16) | + (mode->crtc_hdisplay - 1)); + REG_WRITE(pipesrc_reg, + ((mode->crtc_hdisplay - 1) << 16) | + (mode->crtc_vdisplay - 1)); + } + + REG_WRITE(dsppos_reg, 0); + + if (psb_intel_encoder) + drm_connector_property_get_value(connector, + dev->mode_config.scaling_mode_property, &scalingType); + + if (scalingType == DRM_MODE_SCALE_NO_SCALE) { + /* Medfield doesn't have register support for centering so we + * need to mess with the h/vblank and h/vsync start and ends + * to get centering + */ + int offsetX = 0, offsetY = 0; + + offsetX = (adjusted_mode->crtc_hdisplay - + mode->crtc_hdisplay) / 2; + offsetY = (adjusted_mode->crtc_vdisplay - + mode->crtc_vdisplay) / 2; + + REG_WRITE(htot_reg, (mode->crtc_hdisplay - 1) | + ((adjusted_mode->crtc_htotal - 1) << 16)); + REG_WRITE(vtot_reg, (mode->crtc_vdisplay - 1) | + ((adjusted_mode->crtc_vtotal - 1) << 16)); + REG_WRITE(hblank_reg, (adjusted_mode->crtc_hblank_start - + offsetX - 1) | + ((adjusted_mode->crtc_hblank_end - offsetX - 1) << 16)); + REG_WRITE(hsync_reg, (adjusted_mode->crtc_hsync_start - + offsetX - 1) | + ((adjusted_mode->crtc_hsync_end - offsetX - 1) << 16)); + REG_WRITE(vblank_reg, (adjusted_mode->crtc_vblank_start - + offsetY - 1) | + ((adjusted_mode->crtc_vblank_end - offsetY - 1) << 16)); + REG_WRITE(vsync_reg, (adjusted_mode->crtc_vsync_start - + offsetY - 1) | + ((adjusted_mode->crtc_vsync_end - offsetY - 1) << 16)); + } else { + REG_WRITE(htot_reg, (adjusted_mode->crtc_hdisplay - 1) | + ((adjusted_mode->crtc_htotal - 1) << 16)); + REG_WRITE(vtot_reg, (adjusted_mode->crtc_vdisplay - 1) | + ((adjusted_mode->crtc_vtotal - 1) << 16)); + REG_WRITE(hblank_reg, (adjusted_mode->crtc_hblank_start - 1) | + ((adjusted_mode->crtc_hblank_end - 1) << 16)); + REG_WRITE(hsync_reg, (adjusted_mode->crtc_hsync_start - 1) | + ((adjusted_mode->crtc_hsync_end - 1) 
<< 16)); + REG_WRITE(vblank_reg, (adjusted_mode->crtc_vblank_start - 1) | + ((adjusted_mode->crtc_vblank_end - 1) << 16)); + REG_WRITE(vsync_reg, (adjusted_mode->crtc_vsync_start - 1) | + ((adjusted_mode->crtc_vsync_end - 1) << 16)); + } + + /* Flush the plane changes */ + { + struct drm_crtc_helper_funcs *crtc_funcs = + crtc->helper_private; + crtc_funcs->mode_set_base(crtc, x, y, old_fb); + } + + /* setup pipeconf */ + *pipeconf = PIPEACONF_ENABLE; /* FIXME_JLIU7 REG_READ(pipeconf_reg); */ + + /* Set up the display plane register */ + *dspcntr = REG_READ(dspcntr_reg); + *dspcntr |= pipe << DISPPLANE_SEL_PIPE_POS; + *dspcntr |= DISPLAY_PLANE_ENABLE; + + if (is_mipi2) + goto mrst_crtc_mode_set_exit; + clk = adjusted_mode->clock; + + if (is_hdmi) { + if ((ksel == KSEL_CRYSTAL_19) || (ksel == KSEL_BYPASS_19)) { + refclk = 19200; + + if (is_mipi || is_mipi2) + clk_n = 1, clk_p2 = 8; + else if (is_hdmi) + clk_n = 1, clk_p2 = 10; + } else if (ksel == KSEL_BYPASS_25) { + refclk = 25000; + + if (is_mipi || is_mipi2) + clk_n = 1, clk_p2 = 8; + else if (is_hdmi) + clk_n = 1, clk_p2 = 10; + } else if ((ksel == KSEL_BYPASS_83_100) && + dev_priv->core_freq == 166) { + refclk = 83000; + + if (is_mipi || is_mipi2) + clk_n = 4, clk_p2 = 8; + else if (is_hdmi) + clk_n = 4, clk_p2 = 10; + } else if ((ksel == KSEL_BYPASS_83_100) && + (dev_priv->core_freq == 100 || + dev_priv->core_freq == 200)) { + refclk = 100000; + if (is_mipi || is_mipi2) + clk_n = 4, clk_p2 = 8; + else if (is_hdmi) + clk_n = 4, clk_p2 = 10; + } + + if (is_mipi) + clk_byte = dev_priv->bpp / 8; + else if (is_mipi2) + clk_byte = dev_priv->bpp2 / 8; + + clk_tmp = clk * clk_n * clk_p2 * clk_byte; + + dev_dbg(dev->dev, "clk = %d, clk_n = %d, clk_p2 = %d.\n", + clk, clk_n, clk_p2); + dev_dbg(dev->dev, "adjusted_mode->clock = %d, clk_tmp = %d.\n", + adjusted_mode->clock, clk_tmp); + + ok = mdfldFindBestPLL(crtc, clk_tmp, refclk, &clock); + + if (!ok) { + DRM_ERROR + ("mdfldFindBestPLL fail in mdfld_crtc_mode_set.\n"); + 
} else { + m_conv = mdfld_m_converts[(clock.m - MDFLD_M_MIN)]; + + dev_dbg(dev->dev, "dot clock = %d," + "m = %d, p1 = %d, m_conv = %d.\n", + clock.dot, clock.m, + clock.p1, m_conv); + } + + dpll = REG_READ(dpll_reg); + + if (dpll & DPLL_VCO_ENABLE) { + dpll &= ~DPLL_VCO_ENABLE; + REG_WRITE(dpll_reg, dpll); + REG_READ(dpll_reg); + + /* FIXME jliu7 check the DPLL lock bit PIPEACONF[29] */ + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + + /* reset M1, N1 & P1 */ + REG_WRITE(fp_reg, 0); + dpll &= ~MDFLD_P1_MASK; + REG_WRITE(dpll_reg, dpll); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + } + + /* When ungating power of DPLL, needs to wait 0.5us before + * enable the VCO */ + if (dpll & MDFLD_PWR_GATE_EN) { + dpll &= ~MDFLD_PWR_GATE_EN; + REG_WRITE(dpll_reg, dpll); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + } + dpll = 0; + +#if 0 /* FIXME revisit later */ + if (ksel == KSEL_CRYSTAL_19 || ksel == KSEL_BYPASS_19 || + ksel == KSEL_BYPASS_25) + dpll &= ~MDFLD_INPUT_REF_SEL; + else if (ksel == KSEL_BYPASS_83_100) + dpll |= MDFLD_INPUT_REF_SEL; +#endif /* FIXME revisit later */ + + if (is_hdmi) + dpll |= MDFLD_VCO_SEL; + + fp = (clk_n / 2) << 16; + fp |= m_conv; + + /* compute bitmask from p1 value */ + dpll |= (1 << (clock.p1 - 2)) << 17; + +#if 0 /* 1080p30 & 720p */ + dpll = 0x00050000; + fp = 0x000001be; +#endif +#if 0 /* 480p */ + dpll = 0x02010000; + fp = 0x000000d2; +#endif + } else { +#if 0 /*DBI_TPO_480x864*/ + dpll = 0x00020000; + fp = 0x00000156; +#endif /* DBI_TPO_480x864 */ /* get from spec. 
*/ + + dpll = 0x00800000; + fp = 0x000000c1; + } + + REG_WRITE(fp_reg, fp); + REG_WRITE(dpll_reg, dpll); + /* FIXME_MDFLD PO - change 500 to 1 after PO */ + udelay(500); + + dpll |= DPLL_VCO_ENABLE; + REG_WRITE(dpll_reg, dpll); + REG_READ(dpll_reg); + + /* wait for DSI PLL to lock */ + while (timeout < 20000 && + !(REG_READ(pipeconf_reg) & PIPECONF_DSIPLL_LOCK)) { + udelay(150); + timeout++; + } + + if (is_mipi) + goto mrst_crtc_mode_set_exit; + + dev_dbg(dev->dev, "is_mipi = 0x%x\n", is_mipi); + + REG_WRITE(pipeconf_reg, *pipeconf); + REG_READ(pipeconf_reg); + + /* Wait for the pipe enable to take effect. */ + REG_WRITE(dspcntr_reg, *dspcntr); + psb_intel_wait_for_vblank(dev); + +mrst_crtc_mode_set_exit: + + gma_power_end(dev); + + return 0; +} + +/* + * CRTC helper callback table: dpms, mode_set and mode_set_base point at the + * mdfld_* routines, the remaining hooks at the psb_intel_crtc_* functions. + */ +const struct drm_crtc_helper_funcs mdfld_helper_funcs = { + .dpms = mdfld_crtc_dpms, + .mode_fixup = psb_intel_crtc_mode_fixup, + .mode_set = mdfld_crtc_mode_set, + .mode_set_base = mdfld__intel_pipe_set_base, + .prepare = psb_intel_crtc_prepare, + .commit = psb_intel_crtc_commit, +}; + diff --git a/drivers/gpu/drm/gma500/mdfld_output.c b/drivers/gpu/drm/gma500/mdfld_output.c new file mode 100644 index 000000000000..de0ce0765738 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_output.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicensen + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Thomas Eaton + * Scott Rowe +*/ + +#include "mdfld_output.h" +#include "mdfld_dsi_dpi.h" +#include "mdfld_dsi_output.h" + +#include "tc35876x-dsi-lvds.h" + +/* + * Return the panel id stored in the driver-private data + * (dev_priv->mdfld_panel_id). The pipe argument is not consulted. + */ +int mdfld_get_panel_type(struct drm_device *dev, int pipe) +{ + struct drm_psb_private *dev_priv = dev->dev_private; + return dev_priv->mdfld_panel_id; +} + +/* + * Register the output for one panel. TPO_VID, TC35876X and TMD_VID call + * mdfld_dsi_output_init() with the matching panel_funcs (TC35876X runs + * tc35876x_init() first); HDMI currently does nothing because its init call + * is commented out. Any other p_type is ignored. + */ +static void mdfld_init_panel(struct drm_device *dev, int mipi_pipe, + int p_type) +{ + switch (p_type) { + case TPO_VID: + mdfld_dsi_output_init(dev, mipi_pipe, NULL, + &mdfld_tpo_vid_funcs); + break; + case TC35876X: + tc35876x_init(dev); + mdfld_dsi_output_init(dev, mipi_pipe, NULL, + &mdfld_tc35876x_funcs); + break; + case TMD_VID: + mdfld_dsi_output_init(dev, mipi_pipe, NULL, + &mdfld_tmd_vid_funcs); + break; + case HDMI: +/* if (dev_priv->mdfld_hdmi_present) + mdfld_hdmi_init(dev, &dev_priv->mode_dev); */ + break; + } +} + + +/* + * Set up the outputs: the panel id is forced to TC35876X, then pipe 0 is + * initialised with that panel and pipe 1 with HDMI. Always returns 0. + */ +int mdfld_output_init(struct drm_device *dev) +{ + struct drm_psb_private *dev_priv = dev->dev_private; + + /* FIXME: hardcoded for now */ + dev_priv->mdfld_panel_id = TC35876X; + /* MIPI panel 1 */ + mdfld_init_panel(dev, 0, dev_priv->mdfld_panel_id); + /* HDMI panel */ + mdfld_init_panel(dev, 1, HDMI); + return 0; +} + diff --git a/drivers/gpu/drm/gma500/mdfld_output.h b/drivers/gpu/drm/gma500/mdfld_output.h new file mode 100644 index 000000000000..ab2b27c0f037 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_output.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010 Intel Corporation + * + * Permission is hereby granted, free of 
charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Thomas Eaton + * Scott Rowe +*/ + +#ifndef MDFLD_OUTPUT_H +#define MDFLD_OUTPUT_H + +#include "psb_drv.h" + +/* Panel dimensions; presumably millimetres, matching panel_info's *_mm fields - TODO confirm */ +#define TPO_PANEL_WIDTH 84 +#define TPO_PANEL_HEIGHT 46 +#define TMD_PANEL_WIDTH 39 +#define TMD_PANEL_HEIGHT 71 + +struct mdfld_dsi_config; + +/* Panel identifiers; mdfld_init_panel() switches on these values */ +enum panel_type { + TPO_VID, + TMD_VID, + HDMI, + TC35876X, +}; + +/* Physical panel description filled in by a panel's get_panel_info hook */ +struct panel_info { + u32 width_mm; + u32 height_mm; + /* Other info */ +}; + +/* Per-panel callback table; instances are passed to mdfld_dsi_output_init() */ +struct panel_funcs { + const struct drm_encoder_funcs *encoder_funcs; + const struct drm_encoder_helper_funcs *encoder_helper_funcs; + struct drm_display_mode * (*get_config_mode)(struct drm_device *); + int (*get_panel_info)(struct drm_device *, int, struct panel_info *); + int (*reset)(int pipe); + void (*drv_ic_init)(struct mdfld_dsi_config *dsi_config, int pipe); +}; + +int mdfld_output_init(struct drm_device *dev); + +struct backlight_device *mdfld_get_backlight_device(void); +int mdfld_set_brightness(struct backlight_device *bd); + +int mdfld_get_panel_type(struct drm_device *dev, int pipe); + +extern const struct drm_crtc_helper_funcs mdfld_helper_funcs; + +extern const struct panel_funcs mdfld_tmd_vid_funcs; +extern const struct panel_funcs mdfld_tpo_vid_funcs; + +extern void mdfld_disable_crtc(struct drm_device *dev, int pipe); +extern void mdfldWaitForPipeEnable(struct drm_device *dev, int pipe); +extern void mdfldWaitForPipeDisable(struct drm_device *dev, int pipe); +#endif diff --git a/drivers/gpu/drm/gma500/mdfld_tmd_vid.c b/drivers/gpu/drm/gma500/mdfld_tmd_vid.c new file mode 100644 index 000000000000..dc0c6c3d3d29 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_tmd_vid.c @@ -0,0 +1,201 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, 
sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Jim Liu + * Jackie Li + * Gideon Eaton + */ + +#include "mdfld_dsi_dpi.h" +#include "mdfld_dsi_pkg_sender.h" + +static struct drm_display_mode *tmd_vid_get_config_mode(struct drm_device *dev) +{ + struct drm_display_mode *mode; + struct drm_psb_private *dev_priv = dev->dev_private; + struct oaktrail_timing_info *ti = &dev_priv->gct_data.DTD; + bool use_gct = false; /*Disable GCT for now*/ + + mode = kzalloc(sizeof(*mode), GFP_KERNEL); + if (!mode) + return NULL; + + if (use_gct) { + mode->hdisplay = (ti->hactive_hi << 8) | ti->hactive_lo; + mode->vdisplay = (ti->vactive_hi << 8) | ti->vactive_lo; + mode->hsync_start = mode->hdisplay + \ + ((ti->hsync_offset_hi << 8) | \ + ti->hsync_offset_lo); + mode->hsync_end = mode->hsync_start + \ + ((ti->hsync_pulse_width_hi << 8) | \ + ti->hsync_pulse_width_lo); + mode->htotal = mode->hdisplay + ((ti->hblank_hi << 8) | \ + ti->hblank_lo); + mode->vsync_start = \ + mode->vdisplay + ((ti->vsync_offset_hi << 8) | \ + ti->vsync_offset_lo); + mode->vsync_end = \ + mode->vsync_start + ((ti->vsync_pulse_width_hi << 8) | \ + ti->vsync_pulse_width_lo); + mode->vtotal = mode->vdisplay + \ + 
((ti->vblank_hi << 8) | ti->vblank_lo); + mode->clock = ti->pixel_clock * 10; + + dev_dbg(dev->dev, "hdisplay is %d\n", mode->hdisplay); + dev_dbg(dev->dev, "vdisplay is %d\n", mode->vdisplay); + dev_dbg(dev->dev, "HSS is %d\n", mode->hsync_start); + dev_dbg(dev->dev, "HSE is %d\n", mode->hsync_end); + dev_dbg(dev->dev, "htotal is %d\n", mode->htotal); + dev_dbg(dev->dev, "VSS is %d\n", mode->vsync_start); + dev_dbg(dev->dev, "VSE is %d\n", mode->vsync_end); + dev_dbg(dev->dev, "vtotal is %d\n", mode->vtotal); + dev_dbg(dev->dev, "clock is %d\n", mode->clock); + } else { + mode->hdisplay = 480; + mode->vdisplay = 854; + mode->hsync_start = 487; + mode->hsync_end = 490; + mode->htotal = 499; + mode->vsync_start = 861; + mode->vsync_end = 865; + mode->vtotal = 873; + mode->clock = 33264; + } + + drm_mode_set_name(mode); + drm_mode_set_crtcinfo(mode, 0); + + mode->type |= DRM_MODE_TYPE_PREFERRED; + + return mode; +} + +static int tmd_vid_get_panel_info(struct drm_device *dev, + int pipe, + struct panel_info *pi) +{ + if (!dev || !pi) + return -EINVAL; + + pi->width_mm = TMD_PANEL_WIDTH; + pi->height_mm = TMD_PANEL_HEIGHT; + + return 0; +} + +/* ************************************************************************* *\ + * FUNCTION: mdfld_init_TMD_MIPI + * + * DESCRIPTION: This function is called only by mrst_dsi_mode_set and + * restore_display_registers. since this function does not + * acquire the mutex, it is important that the calling function + * does! +\* ************************************************************************* */ + +/* FIXME: make the below data u8 instead of u32; note byte order! 
*/ +static u32 tmd_cmd_mcap_off[] = {0x000000b2}; +static u32 tmd_cmd_enable_lane_switch[] = {0x000101ef}; +static u32 tmd_cmd_set_lane_num[] = {0x006360ef}; +static u32 tmd_cmd_pushing_clock0[] = {0x00cc2fef}; +static u32 tmd_cmd_pushing_clock1[] = {0x00dd6eef}; +static u32 tmd_cmd_set_mode[] = {0x000000b3}; +static u32 tmd_cmd_set_sync_pulse_mode[] = {0x000961ef}; +static u32 tmd_cmd_set_column[] = {0x0100002a, 0x000000df}; +static u32 tmd_cmd_set_page[] = {0x0300002b, 0x00000055}; +static u32 tmd_cmd_set_video_mode[] = {0x00000153}; +/*no auto_bl,need add in future*/ +static u32 tmd_cmd_enable_backlight[] = {0x00005ab4}; +static u32 tmd_cmd_set_backlight_dimming[] = {0x00000ebd}; + +static void mdfld_dsi_tmd_drv_ic_init(struct mdfld_dsi_config *dsi_config, + int pipe) +{ + struct mdfld_dsi_pkg_sender *sender + = mdfld_dsi_get_pkg_sender(dsi_config); + + DRM_INFO("Enter mdfld init TMD MIPI display.\n"); + + if (!sender) { + DRM_ERROR("Cannot get sender\n"); + return; + } + + if (dsi_config->dvr_ic_inited) + return; + + msleep(3); + + /* FIXME: make the below data u8 instead of u32; note byte order! 
*/ + + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_mcap_off, + sizeof(tmd_cmd_mcap_off), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_enable_lane_switch, + sizeof(tmd_cmd_enable_lane_switch), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_lane_num, + sizeof(tmd_cmd_set_lane_num), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_pushing_clock0, + sizeof(tmd_cmd_pushing_clock0), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_pushing_clock1, + sizeof(tmd_cmd_pushing_clock1), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_mode, + sizeof(tmd_cmd_set_mode), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_sync_pulse_mode, + sizeof(tmd_cmd_set_sync_pulse_mode), false); + mdfld_dsi_send_mcs_long(sender, (u8 *) tmd_cmd_set_column, + sizeof(tmd_cmd_set_column), false); + mdfld_dsi_send_mcs_long(sender, (u8 *) tmd_cmd_set_page, + sizeof(tmd_cmd_set_page), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_video_mode, + sizeof(tmd_cmd_set_video_mode), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_enable_backlight, + sizeof(tmd_cmd_enable_backlight), false); + mdfld_dsi_send_gen_long(sender, (u8 *) tmd_cmd_set_backlight_dimming, + sizeof(tmd_cmd_set_backlight_dimming), false); + + dsi_config->dvr_ic_inited = 1; +} + +/*TPO DPI encoder helper funcs*/ +static const struct drm_encoder_helper_funcs + mdfld_tpo_dpi_encoder_helper_funcs = { + .dpms = mdfld_dsi_dpi_dpms, + .mode_fixup = mdfld_dsi_dpi_mode_fixup, + .prepare = mdfld_dsi_dpi_prepare, + .mode_set = mdfld_dsi_dpi_mode_set, + .commit = mdfld_dsi_dpi_commit, +}; + +/*TPO DPI encoder funcs*/ +static const struct drm_encoder_funcs mdfld_tpo_dpi_encoder_funcs = { + .destroy = drm_encoder_cleanup, +}; + +const struct panel_funcs mdfld_tmd_vid_funcs = { + .encoder_funcs = &mdfld_tpo_dpi_encoder_funcs, + .encoder_helper_funcs = &mdfld_tpo_dpi_encoder_helper_funcs, + .get_config_mode = &tmd_vid_get_config_mode, + .get_panel_info = 
tmd_vid_get_panel_info, + .reset = mdfld_dsi_panel_reset, + .drv_ic_init = mdfld_dsi_tmd_drv_ic_init, +}; diff --git a/drivers/gpu/drm/gma500/mdfld_tpo_vid.c b/drivers/gpu/drm/gma500/mdfld_tpo_vid.c new file mode 100644 index 000000000000..d8d4170725b2 --- /dev/null +++ b/drivers/gpu/drm/gma500/mdfld_tpo_vid.c @@ -0,0 +1,124 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * jim liu + * Jackie Li + */ + +#include "mdfld_dsi_dpi.h" + +static struct drm_display_mode *tpo_vid_get_config_mode(struct drm_device *dev) +{ + struct drm_display_mode *mode; + struct drm_psb_private *dev_priv = dev->dev_private; + struct oaktrail_timing_info *ti = &dev_priv->gct_data.DTD; + bool use_gct = false; + + mode = kzalloc(sizeof(*mode), GFP_KERNEL); + if (!mode) + return NULL; + + if (use_gct) { + mode->hdisplay = (ti->hactive_hi << 8) | ti->hactive_lo; + mode->vdisplay = (ti->vactive_hi << 8) | ti->vactive_lo; + mode->hsync_start = mode->hdisplay + + ((ti->hsync_offset_hi << 8) | + ti->hsync_offset_lo); + mode->hsync_end = mode->hsync_start + + ((ti->hsync_pulse_width_hi << 8) | + ti->hsync_pulse_width_lo); + mode->htotal = mode->hdisplay + ((ti->hblank_hi << 8) | + ti->hblank_lo); + mode->vsync_start = + mode->vdisplay + ((ti->vsync_offset_hi << 8) | + ti->vsync_offset_lo); + mode->vsync_end = + mode->vsync_start + ((ti->vsync_pulse_width_hi << 8) | + ti->vsync_pulse_width_lo); + mode->vtotal = mode->vdisplay + + ((ti->vblank_hi << 8) | ti->vblank_lo); + mode->clock = ti->pixel_clock * 10; + + dev_dbg(dev->dev, "hdisplay is %d\n", mode->hdisplay); + dev_dbg(dev->dev, "vdisplay is %d\n", mode->vdisplay); + dev_dbg(dev->dev, "HSS is %d\n", mode->hsync_start); + dev_dbg(dev->dev, "HSE is %d\n", mode->hsync_end); + dev_dbg(dev->dev, "htotal is %d\n", mode->htotal); + dev_dbg(dev->dev, "VSS is %d\n", mode->vsync_start); + dev_dbg(dev->dev, "VSE is %d\n", mode->vsync_end); + dev_dbg(dev->dev, "vtotal is %d\n", mode->vtotal); + dev_dbg(dev->dev, "clock is %d\n", mode->clock); + } else { + mode->hdisplay = 864; + mode->vdisplay = 480; + mode->hsync_start = 873; + mode->hsync_end = 876; + mode->htotal = 887; + mode->vsync_start = 487; + mode->vsync_end = 490; + mode->vtotal = 499; + mode->clock = 33264; + } + + drm_mode_set_name(mode); + drm_mode_set_crtcinfo(mode, 0); + + mode->type |= DRM_MODE_TYPE_PREFERRED; + + return mode; +} + +static 
int tpo_vid_get_panel_info(struct drm_device *dev, + int pipe, + struct panel_info *pi) +{ + if (!dev || !pi) + return -EINVAL; + + pi->width_mm = TPO_PANEL_WIDTH; + pi->height_mm = TPO_PANEL_HEIGHT; + + return 0; +} + +/*TPO DPI encoder helper funcs*/ +static const struct drm_encoder_helper_funcs + mdfld_tpo_dpi_encoder_helper_funcs = { + .dpms = mdfld_dsi_dpi_dpms, + .mode_fixup = mdfld_dsi_dpi_mode_fixup, + .prepare = mdfld_dsi_dpi_prepare, + .mode_set = mdfld_dsi_dpi_mode_set, + .commit = mdfld_dsi_dpi_commit, +}; + +/*TPO DPI encoder funcs*/ +static const struct drm_encoder_funcs mdfld_tpo_dpi_encoder_funcs = { + .destroy = drm_encoder_cleanup, +}; + +const struct panel_funcs mdfld_tpo_vid_funcs = { + .encoder_funcs = &mdfld_tpo_dpi_encoder_funcs, + .encoder_helper_funcs = &mdfld_tpo_dpi_encoder_helper_funcs, + .get_config_mode = &tpo_vid_get_config_mode, + .get_panel_info = tpo_vid_get_panel_info, +}; diff --git a/drivers/gpu/drm/gma500/psb_drv.c b/drivers/gpu/drm/gma500/psb_drv.c index 1f57aac2cf80..fc3293049fe7 100644 --- a/drivers/gpu/drm/gma500/psb_drv.c +++ b/drivers/gpu/drm/gma500/psb_drv.c @@ -60,6 +60,16 @@ static DEFINE_PCI_DEVICE_TABLE(pciidlist) = { /* Atom E620 */ { 0x8086, 0x4108, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &oaktrail_chip_ops}, #endif +#if defined(CONFIG_DRM_MEDFIELD) + {0x8086, 0x0130, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, + {0x8086, 0x0131, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, + {0x8086, 0x0132, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, + {0x8086, 0x0133, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, + {0x8086, 0x0134, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, + {0x8086, 0x0135, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, + {0x8086, 0x0136, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, + {0x8086, 0x0137, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &mdfld_chip_ops}, +#endif #if defined(CONFIG_DRM_GMA3600) { 0x8086, 0x0be0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 
(long) &cdv_chip_ops}, { 0x8086, 0x0be1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, (long) &cdv_chip_ops}, diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h index 3c0bf7be2738..af1c99752000 100644 --- a/drivers/gpu/drm/gma500/psb_drv.h +++ b/drivers/gpu/drm/gma500/psb_drv.h @@ -389,11 +389,79 @@ struct psb_state { uint32_t savePWM_CONTROL_LOGIC; }; +struct medfield_state { + uint32_t saveDPLL_A; + uint32_t saveFPA0; + uint32_t savePIPEACONF; + uint32_t saveHTOTAL_A; + uint32_t saveHBLANK_A; + uint32_t saveHSYNC_A; + uint32_t saveVTOTAL_A; + uint32_t saveVBLANK_A; + uint32_t saveVSYNC_A; + uint32_t savePIPEASRC; + uint32_t saveDSPASTRIDE; + uint32_t saveDSPALINOFF; + uint32_t saveDSPATILEOFF; + uint32_t saveDSPASIZE; + uint32_t saveDSPAPOS; + uint32_t saveDSPASURF; + uint32_t saveDSPACNTR; + uint32_t saveDSPASTATUS; + uint32_t save_palette_a[256]; + uint32_t saveMIPI; + + uint32_t saveDPLL_B; + uint32_t saveFPB0; + uint32_t savePIPEBCONF; + uint32_t saveHTOTAL_B; + uint32_t saveHBLANK_B; + uint32_t saveHSYNC_B; + uint32_t saveVTOTAL_B; + uint32_t saveVBLANK_B; + uint32_t saveVSYNC_B; + uint32_t savePIPEBSRC; + uint32_t saveDSPBSTRIDE; + uint32_t saveDSPBLINOFF; + uint32_t saveDSPBTILEOFF; + uint32_t saveDSPBSIZE; + uint32_t saveDSPBPOS; + uint32_t saveDSPBSURF; + uint32_t saveDSPBCNTR; + uint32_t saveDSPBSTATUS; + uint32_t save_palette_b[256]; + + uint32_t savePIPECCONF; + uint32_t saveHTOTAL_C; + uint32_t saveHBLANK_C; + uint32_t saveHSYNC_C; + uint32_t saveVTOTAL_C; + uint32_t saveVBLANK_C; + uint32_t saveVSYNC_C; + uint32_t savePIPECSRC; + uint32_t saveDSPCSTRIDE; + uint32_t saveDSPCLINOFF; + uint32_t saveDSPCTILEOFF; + uint32_t saveDSPCSIZE; + uint32_t saveDSPCPOS; + uint32_t saveDSPCSURF; + uint32_t saveDSPCCNTR; + uint32_t saveDSPCSTATUS; + uint32_t save_palette_c[256]; + uint32_t saveMIPI_C; + + uint32_t savePFIT_CONTROL; + uint32_t savePFIT_PGM_RATIOS; + uint32_t saveHDMIPHYMISCCTL; + uint32_t saveHDMIB_CONTROL; +}; + struct psb_save_area { 
uint32_t saveBSM; uint32_t saveVBT; union { struct psb_state psb; + struct medfield_state mdfld; }; uint32_t saveBLC_PWM_CTL2; uint32_t saveBLC_PWM_CTL; @@ -563,6 +631,24 @@ struct drm_psb_private { /* 2D acceleration */ spinlock_t lock_2d; + + /* + * Panel brightness + */ + int brightness; + int brightness_adjusted; + + bool dsr_enable; + u32 dsr_fb_update; + bool dpi_panel_on[3]; + void *dsi_configs[2]; + u32 bpp; + u32 bpp2; + + u32 pipeconf[3]; + u32 dspcntr[3]; + + int mdfld_panel_id; }; @@ -758,6 +844,9 @@ extern const struct psb_ops psb_chip_ops; /* oaktrail_device.c */ extern const struct psb_ops oaktrail_chip_ops; +/* mdlfd_device.c */ +extern const struct psb_ops mdfld_chip_ops; + /* cdv_device.c */ extern const struct psb_ops cdv_chip_ops; diff --git a/drivers/gpu/drm/gma500/psb_irq.c b/drivers/gpu/drm/gma500/psb_irq.c index 7be802baceb5..a86fc3c4bf3a 100644 --- a/drivers/gpu/drm/gma500/psb_irq.c +++ b/drivers/gpu/drm/gma500/psb_irq.c @@ -27,6 +27,8 @@ #include "psb_reg.h" #include "psb_intel_reg.h" #include "power.h" +#include "psb_irq.h" +#include "mdfld_output.h" /* * inline functions @@ -453,6 +455,11 @@ int psb_enable_vblank(struct drm_device *dev, int pipe) uint32_t reg_val = 0; uint32_t pipeconf_reg = mid_pipeconf(pipe); + /* Medfield is different - we should perhaps extract out vblank + and blacklight etc ops */ + if (IS_MFLD(dev)) + return mdfld_enable_te(dev, pipe); + if (gma_power_begin(dev, false)) { reg_val = REG_READ(pipeconf_reg); gma_power_end(dev); @@ -485,6 +492,8 @@ void psb_disable_vblank(struct drm_device *dev, int pipe) struct drm_psb_private *dev_priv = dev->dev_private; unsigned long irqflags; + if (IS_MFLD(dev)) + mdfld_disable_te(dev, pipe); spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags); if (pipe == 0) @@ -499,6 +508,55 @@ void psb_disable_vblank(struct drm_device *dev, int pipe) spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags); } +/* + * It is used to enable TE interrupt + */ +int mdfld_enable_te(struct 
drm_device *dev, int pipe) +{ + struct drm_psb_private *dev_priv = + (struct drm_psb_private *) dev->dev_private; + unsigned long irqflags; + uint32_t reg_val = 0; + uint32_t pipeconf_reg = mid_pipeconf(pipe); + + if (gma_power_begin(dev, false)) { + reg_val = REG_READ(pipeconf_reg); + gma_power_end(dev); + } + + if (!(reg_val & PIPEACONF_ENABLE)) + return -EINVAL; + + spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags); + + mid_enable_pipe_event(dev_priv, pipe); + psb_enable_pipestat(dev_priv, pipe, PIPE_TE_ENABLE); + + spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags); + + return 0; +} + +/* + * It is used to disable TE interrupt + */ +void mdfld_disable_te(struct drm_device *dev, int pipe) +{ + struct drm_psb_private *dev_priv = + (struct drm_psb_private *) dev->dev_private; + unsigned long irqflags; + + if (!dev_priv->dsr_enable) + return; + + spin_lock_irqsave(&dev_priv->irqmask_lock, irqflags); + + mid_disable_pipe_event(dev_priv, pipe); + psb_disable_pipestat(dev_priv, pipe, PIPE_TE_ENABLE); + + spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags); +} + /* Called from drm generic code, passed a 'crtc', which * we use as a pipe index */ diff --git a/drivers/gpu/drm/gma500/psb_irq.h b/drivers/gpu/drm/gma500/psb_irq.h index 216fda38b57d..603045bee58a 100644 --- a/drivers/gpu/drm/gma500/psb_irq.h +++ b/drivers/gpu/drm/gma500/psb_irq.h @@ -42,4 +42,6 @@ int psb_enable_vblank(struct drm_device *dev, int pipe); void psb_disable_vblank(struct drm_device *dev, int pipe); u32 psb_get_vblank_counter(struct drm_device *dev, int pipe); +int mdfld_enable_te(struct drm_device *dev, int pipe); +void mdfld_disable_te(struct drm_device *dev, int pipe); #endif /* _SYSIRQ_H_ */ diff --git a/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c new file mode 100644 index 000000000000..4a07ab596174 --- /dev/null +++ b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c @@ -0,0 +1,829 @@ +/* + * Copyright © 2011 Intel Corporation + * + * 
Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "mdfld_dsi_dpi.h" +#include "mdfld_output.h" +#include "mdfld_dsi_pkg_sender.h" +#include "tc35876x-dsi-lvds.h" +#include +#include +#include +#include + +static struct i2c_client *tc35876x_client; +static struct i2c_client *cmi_lcd_i2c_client; + +#define FLD_MASK(start, end) (((1 << ((start) - (end) + 1)) - 1) << (end)) +#define FLD_VAL(val, start, end) (((val) << (end)) & FLD_MASK(start, end)) + +/* DSI D-PHY Layer Registers */ +#define D0W_DPHYCONTTX 0x0004 +#define CLW_DPHYCONTRX 0x0020 +#define D0W_DPHYCONTRX 0x0024 +#define D1W_DPHYCONTRX 0x0028 +#define D2W_DPHYCONTRX 0x002C +#define D3W_DPHYCONTRX 0x0030 +#define COM_DPHYCONTRX 0x0038 +#define CLW_CNTRL 0x0040 +#define D0W_CNTRL 0x0044 +#define D1W_CNTRL 0x0048 +#define D2W_CNTRL 0x004C +#define D3W_CNTRL 0x0050 +#define DFTMODE_CNTRL 0x0054 + +/* DSI PPI Layer Registers */ +#define PPI_STARTPPI 0x0104 +#define PPI_BUSYPPI 0x0108 +#define PPI_LINEINITCNT 0x0110 +#define PPI_LPTXTIMECNT 0x0114 +#define PPI_LANEENABLE 0x0134 +#define PPI_TX_RX_TA 0x013C +#define PPI_CLS_ATMR 0x0140 +#define PPI_D0S_ATMR 0x0144 +#define PPI_D1S_ATMR 0x0148 +#define PPI_D2S_ATMR 0x014C +#define PPI_D3S_ATMR 0x0150 +#define PPI_D0S_CLRSIPOCOUNT 0x0164 +#define PPI_D1S_CLRSIPOCOUNT 0x0168 +#define PPI_D2S_CLRSIPOCOUNT 0x016C +#define PPI_D3S_CLRSIPOCOUNT 0x0170 +#define CLS_PRE 0x0180 +#define D0S_PRE 0x0184 +#define D1S_PRE 0x0188 +#define D2S_PRE 0x018C +#define D3S_PRE 0x0190 +#define CLS_PREP 0x01A0 +#define D0S_PREP 0x01A4 +#define D1S_PREP 0x01A8 +#define D2S_PREP 0x01AC +#define D3S_PREP 0x01B0 +#define CLS_ZERO 0x01C0 +#define D0S_ZERO 0x01C4 +#define D1S_ZERO 0x01C8 +#define D2S_ZERO 0x01CC +#define D3S_ZERO 0x01D0 +#define PPI_CLRFLG 0x01E0 +#define PPI_CLRSIPO 0x01E4 +#define HSTIMEOUT 0x01F0 +#define HSTIMEOUTENABLE 0x01F4 + +/* DSI Protocol Layer Registers */ +#define DSI_STARTDSI 0x0204 +#define DSI_BUSYDSI 0x0208 +#define DSI_LANEENABLE 0x0210 +#define DSI_LANESTATUS0 0x0214 +#define 
DSI_LANESTATUS1 0x0218 +#define DSI_INTSTATUS 0x0220 +#define DSI_INTMASK 0x0224 +#define DSI_INTCLR 0x0228 +#define DSI_LPTXTO 0x0230 + +/* DSI General Registers */ +#define DSIERRCNT 0x0300 + +/* DSI Application Layer Registers */ +#define APLCTRL 0x0400 +#define RDPKTLN 0x0404 + +/* Video Path Registers */ +#define VPCTRL 0x0450 +#define HTIM1 0x0454 +#define HTIM2 0x0458 +#define VTIM1 0x045C +#define VTIM2 0x0460 +#define VFUEN 0x0464 + +/* LVDS Registers */ +#define LVMX0003 0x0480 +#define LVMX0407 0x0484 +#define LVMX0811 0x0488 +#define LVMX1215 0x048C +#define LVMX1619 0x0490 +#define LVMX2023 0x0494 +#define LVMX2427 0x0498 +#define LVCFG 0x049C +#define LVPHY0 0x04A0 +#define LVPHY1 0x04A4 + +/* System Registers */ +#define SYSSTAT 0x0500 +#define SYSRST 0x0504 + +/* GPIO Registers */ +/*#define GPIOC 0x0520*/ +#define GPIOO 0x0524 +#define GPIOI 0x0528 + +/* I2C Registers */ +#define I2CTIMCTRL 0x0540 +#define I2CMADDR 0x0544 +#define WDATAQ 0x0548 +#define RDATAQ 0x054C + +/* Chip/Rev Registers */ +#define IDREG 0x0580 + +/* Debug Registers */ +#define DEBUG00 0x05A0 +#define DEBUG01 0x05A4 + +/* Panel CABC registers */ +#define PANEL_PWM_CONTROL 0x90 +#define PANEL_FREQ_DIVIDER_HI 0x91 +#define PANEL_FREQ_DIVIDER_LO 0x92 +#define PANEL_DUTY_CONTROL 0x93 +#define PANEL_MODIFY_RGB 0x94 +#define PANEL_FRAMERATE_CONTROL 0x96 +#define PANEL_PWM_MIN 0x97 +#define PANEL_PWM_REF 0x98 +#define PANEL_PWM_MAX 0x99 +#define PANEL_ALLOW_DISTORT 0x9A +#define PANEL_BYPASS_PWMI 0x9B + +/* Panel color management registers */ +#define PANEL_CM_ENABLE 0x700 +#define PANEL_CM_HUE 0x701 +#define PANEL_CM_SATURATION 0x702 +#define PANEL_CM_INTENSITY 0x703 +#define PANEL_CM_BRIGHTNESS 0x704 +#define PANEL_CM_CE_ENABLE 0x705 +#define PANEL_CM_PEAK_EN 0x710 +#define PANEL_CM_GAIN 0x711 +#define PANEL_CM_HUETABLE_START 0x730 +#define PANEL_CM_HUETABLE_END 0x747 /* inclusive */ + +/* Input muxing for registers LVMX0003...LVMX2427 */ +enum { + INPUT_R0, /* 0 */ + INPUT_R1, + 
INPUT_R2, + INPUT_R3, + INPUT_R4, + INPUT_R5, + INPUT_R6, + INPUT_R7, + INPUT_G0, /* 8 */ + INPUT_G1, + INPUT_G2, + INPUT_G3, + INPUT_G4, + INPUT_G5, + INPUT_G6, + INPUT_G7, + INPUT_B0, /* 16 */ + INPUT_B1, + INPUT_B2, + INPUT_B3, + INPUT_B4, + INPUT_B5, + INPUT_B6, + INPUT_B7, + INPUT_HSYNC, /* 24 */ + INPUT_VSYNC, + INPUT_DE, + LOGIC_0, + /* 28...31 undefined */ +}; + +#define INPUT_MUX(lvmx03, lvmx02, lvmx01, lvmx00) \ + (FLD_VAL(lvmx03, 29, 24) | FLD_VAL(lvmx02, 20, 16) | \ + FLD_VAL(lvmx01, 12, 8) | FLD_VAL(lvmx00, 4, 0)) + +/** + * tc35876x_regw - Write DSI-LVDS bridge register using I2C + * @client: struct i2c_client to use + * @reg: register address + * @value: value to write + * + * Returns 0 on success, or a negative error value. + */ +static int tc35876x_regw(struct i2c_client *client, u16 reg, u32 value) +{ + int r; + u8 tx_data[] = { + /* NOTE: Register address big-endian, data little-endian. */ + (reg >> 8) & 0xff, + reg & 0xff, + value & 0xff, + (value >> 8) & 0xff, + (value >> 16) & 0xff, + (value >> 24) & 0xff, + }; + struct i2c_msg msgs[] = { + { + .addr = client->addr, + .flags = 0, + .buf = tx_data, + .len = ARRAY_SIZE(tx_data), + }, + }; + + r = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); + if (r < 0) { + dev_err(&client->dev, "%s: reg 0x%04x val 0x%08x error %d\n", + __func__, reg, value, r); + return r; + } + + if (r < ARRAY_SIZE(msgs)) { + dev_err(&client->dev, "%s: reg 0x%04x val 0x%08x msgs %d\n", + __func__, reg, value, r); + return -EAGAIN; + } + + dev_dbg(&client->dev, "%s: reg 0x%04x val 0x%08x\n", + __func__, reg, value); + + return 0; +} + +/** + * tc35876x_regr - Read DSI-LVDS bridge register using I2C + * @client: struct i2c_client to use + * @reg: register address + * @value: pointer for storing the value + * + * Returns 0 on success, or a negative error value. 
+ */ +static int tc35876x_regr(struct i2c_client *client, u16 reg, u32 *value) +{ + int r; + u8 tx_data[] = { + (reg >> 8) & 0xff, + reg & 0xff, + }; + u8 rx_data[4]; + struct i2c_msg msgs[] = { + { + .addr = client->addr, + .flags = 0, + .buf = tx_data, + .len = ARRAY_SIZE(tx_data), + }, + { + .addr = client->addr, + .flags = I2C_M_RD, + .buf = rx_data, + .len = ARRAY_SIZE(rx_data), + }, + }; + + r = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); + if (r < 0) { + dev_err(&client->dev, "%s: reg 0x%04x error %d\n", __func__, + reg, r); + return r; + } + + if (r < ARRAY_SIZE(msgs)) { + dev_err(&client->dev, "%s: reg 0x%04x msgs %d\n", __func__, + reg, r); + return -EAGAIN; + } + + *value = rx_data[0] << 24 | rx_data[1] << 16 | + rx_data[2] << 8 | rx_data[3]; + + dev_dbg(&client->dev, "%s: reg 0x%04x value 0x%08x\n", __func__, + reg, *value); + + return 0; +} + +void tc35876x_set_bridge_reset_state(struct drm_device *dev, int state) +{ + struct tc35876x_platform_data *pdata; + + if (WARN(!tc35876x_client, "%s called before probe", __func__)) + return; + + dev_dbg(&tc35876x_client->dev, "%s: state %d\n", __func__, state); + + pdata = dev_get_platdata(&tc35876x_client->dev); + + if (pdata->gpio_bridge_reset == -1) + return; + + if (state) { + gpio_set_value_cansleep(pdata->gpio_bridge_reset, 0); + mdelay(10); + } else { + /* Pull MIPI Bridge reset pin to Low */ + gpio_set_value_cansleep(pdata->gpio_bridge_reset, 0); + mdelay(20); + /* Pull MIPI Bridge reset pin to High */ + gpio_set_value_cansleep(pdata->gpio_bridge_reset, 1); + mdelay(40); + } +} + +void tc35876x_configure_lvds_bridge(struct drm_device *dev) +{ + struct i2c_client *i2c = tc35876x_client; + u32 ppi_lptxtimecnt; + u32 txtagocnt; + u32 txtasurecnt; + u32 id; + + if (WARN(!tc35876x_client, "%s called before probe", __func__)) + return; + + dev_dbg(&tc35876x_client->dev, "%s\n", __func__); + + if (!tc35876x_regr(i2c, IDREG, &id)) + dev_info(&tc35876x_client->dev, "tc35876x ID 0x%08x\n", id); + else 
+ dev_err(&tc35876x_client->dev, "Cannot read ID\n"); + + ppi_lptxtimecnt = 4; + txtagocnt = (5 * ppi_lptxtimecnt - 3) / 4; + txtasurecnt = 3 * ppi_lptxtimecnt / 2; + tc35876x_regw(i2c, PPI_TX_RX_TA, FLD_VAL(txtagocnt, 26, 16) | + FLD_VAL(txtasurecnt, 10, 0)); + tc35876x_regw(i2c, PPI_LPTXTIMECNT, FLD_VAL(ppi_lptxtimecnt, 10, 0)); + + tc35876x_regw(i2c, PPI_D0S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0)); + tc35876x_regw(i2c, PPI_D1S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0)); + tc35876x_regw(i2c, PPI_D2S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0)); + tc35876x_regw(i2c, PPI_D3S_CLRSIPOCOUNT, FLD_VAL(1, 5, 0)); + + /* Enabling MIPI & PPI lanes, Enable 4 lanes */ + tc35876x_regw(i2c, PPI_LANEENABLE, + BIT(4) | BIT(3) | BIT(2) | BIT(1) | BIT(0)); + tc35876x_regw(i2c, DSI_LANEENABLE, + BIT(4) | BIT(3) | BIT(2) | BIT(1) | BIT(0)); + tc35876x_regw(i2c, PPI_STARTPPI, BIT(0)); + tc35876x_regw(i2c, DSI_STARTDSI, BIT(0)); + + /* Setting LVDS output frequency */ + tc35876x_regw(i2c, LVPHY0, FLD_VAL(1, 20, 16) | + FLD_VAL(2, 15, 14) | FLD_VAL(6, 4, 0)); /* 0x00048006 */ + + /* Setting video panel control register,0x00000120 VTGen=ON ?!?!? */ + tc35876x_regw(i2c, VPCTRL, BIT(8) | BIT(5)); + + /* Horizontal back porch and horizontal pulse width. 0x00280028 */ + tc35876x_regw(i2c, HTIM1, FLD_VAL(40, 24, 16) | FLD_VAL(40, 8, 0)); + + /* Horizontal front porch and horizontal active video size. 0x00500500*/ + tc35876x_regw(i2c, HTIM2, FLD_VAL(80, 24, 16) | FLD_VAL(1280, 10, 0)); + + /* Vertical back porch and vertical sync pulse width. 0x000e000a */ + tc35876x_regw(i2c, VTIM1, FLD_VAL(14, 23, 16) | FLD_VAL(10, 7, 0)); + + /* Vertical front porch and vertical display size. 0x000e0320 */ + tc35876x_regw(i2c, VTIM2, FLD_VAL(14, 23, 16) | FLD_VAL(800, 10, 0)); + + /* Set above HTIM1, HTIM2, VTIM1, and VTIM2 at next VSYNC. */ + tc35876x_regw(i2c, VFUEN, BIT(0)); + + /* Soft reset LCD controller. 
*/ + tc35876x_regw(i2c, SYSRST, BIT(2)); + + /* LVDS-TX input muxing */ + tc35876x_regw(i2c, LVMX0003, + INPUT_MUX(INPUT_R5, INPUT_R4, INPUT_R3, INPUT_R2)); + tc35876x_regw(i2c, LVMX0407, + INPUT_MUX(INPUT_G2, INPUT_R7, INPUT_R1, INPUT_R6)); + tc35876x_regw(i2c, LVMX0811, + INPUT_MUX(INPUT_G1, INPUT_G0, INPUT_G4, INPUT_G3)); + tc35876x_regw(i2c, LVMX1215, + INPUT_MUX(INPUT_B2, INPUT_G7, INPUT_G6, INPUT_G5)); + tc35876x_regw(i2c, LVMX1619, + INPUT_MUX(INPUT_B4, INPUT_B3, INPUT_B1, INPUT_B0)); + tc35876x_regw(i2c, LVMX2023, + INPUT_MUX(LOGIC_0, INPUT_B7, INPUT_B6, INPUT_B5)); + tc35876x_regw(i2c, LVMX2427, + INPUT_MUX(INPUT_R0, INPUT_DE, INPUT_VSYNC, INPUT_HSYNC)); + + /* Enable LVDS transmitter. */ + tc35876x_regw(i2c, LVCFG, BIT(0)); + + /* Clear notifications. Don't write reserved bits. Was write 0xffffffff + * to 0x0288, must be in error?! */ + tc35876x_regw(i2c, DSI_INTCLR, FLD_MASK(31, 30) | FLD_MASK(22, 0)); +} + +#define GPIOPWMCTRL 0x38F +#define PWM0CLKDIV0 0x62 /* low byte */ +#define PWM0CLKDIV1 0x61 /* high byte */ + +#define SYSTEMCLK 19200000UL /* 19.2 MHz */ +#define PWM_FREQUENCY 9600 /* Hz */ + +/* f = baseclk / (clkdiv + 1) => clkdiv = (baseclk - f) / f */ +static inline u16 calc_clkdiv(unsigned long baseclk, unsigned int f) +{ + return (baseclk - f) / f; +} + +static void tc35876x_brightness_init(struct drm_device *dev) +{ + int ret; + u8 pwmctrl; + u16 clkdiv; + + /* Make sure the PWM reference is the 19.2 MHz system clock. Read first + * instead of setting directly to catch potential conflicts between PWM + * users. 
*/ + ret = intel_scu_ipc_ioread8(GPIOPWMCTRL, &pwmctrl); + if (ret || pwmctrl != 0x01) { + if (ret) + dev_err(&dev->pdev->dev, "GPIOPWMCTRL read failed\n"); + else + dev_warn(&dev->pdev->dev, "GPIOPWMCTRL was not set to system clock (pwmctrl = 0x%02x)\n", pwmctrl); + + ret = intel_scu_ipc_iowrite8(GPIOPWMCTRL, 0x01); + if (ret) + dev_err(&dev->pdev->dev, "GPIOPWMCTRL set failed\n"); + } + + clkdiv = calc_clkdiv(SYSTEMCLK, PWM_FREQUENCY); + + ret = intel_scu_ipc_iowrite8(PWM0CLKDIV1, (clkdiv >> 8) & 0xff); + if (!ret) + ret = intel_scu_ipc_iowrite8(PWM0CLKDIV0, clkdiv & 0xff); + + if (ret) + dev_err(&dev->pdev->dev, "PWM0CLKDIV set failed\n"); + else + dev_dbg(&dev->pdev->dev, "PWM0CLKDIV set to 0x%04x (%d Hz)\n", + clkdiv, PWM_FREQUENCY); +} + +#define PWM0DUTYCYCLE 0x67 + +void tc35876x_brightness_control(struct drm_device *dev, int level) +{ + int ret; + u8 duty_val; + u8 panel_duty_val; + + level = clamp(level, 0, MDFLD_DSI_BRIGHTNESS_MAX_LEVEL); + + /* PWM duty cycle 0x00...0x63 corresponds to 0...99% */ + duty_val = level * 0x63 / MDFLD_DSI_BRIGHTNESS_MAX_LEVEL; + + /* I won't pretend to understand this formula. The panel spec is quite + * bad engrish. 
+ */ + panel_duty_val = (2 * level - 100) * 0xA9 / + MDFLD_DSI_BRIGHTNESS_MAX_LEVEL + 0x56; + + ret = intel_scu_ipc_iowrite8(PWM0DUTYCYCLE, duty_val); + if (ret) + dev_err(&tc35876x_client->dev, "%s: ipc write fail\n", + __func__); + + if (cmi_lcd_i2c_client) { + ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client, + PANEL_PWM_MAX, panel_duty_val); + if (ret < 0) + dev_err(&cmi_lcd_i2c_client->dev, "%s: i2c write failed\n", + __func__); + } +} + +void tc35876x_toshiba_bridge_panel_off(struct drm_device *dev) +{ + struct tc35876x_platform_data *pdata; + + if (WARN(!tc35876x_client, "%s called before probe", __func__)) + return; + + dev_dbg(&tc35876x_client->dev, "%s\n", __func__); + + pdata = dev_get_platdata(&tc35876x_client->dev); + + if (pdata->gpio_panel_bl_en != -1) + gpio_set_value_cansleep(pdata->gpio_panel_bl_en, 0); + + if (pdata->gpio_panel_vadd != -1) + gpio_set_value_cansleep(pdata->gpio_panel_vadd, 0); +} + +void tc35876x_toshiba_bridge_panel_on(struct drm_device *dev) +{ + struct tc35876x_platform_data *pdata; + struct drm_psb_private *dev_priv = dev->dev_private; + + if (WARN(!tc35876x_client, "%s called before probe", __func__)) + return; + + dev_dbg(&tc35876x_client->dev, "%s\n", __func__); + + pdata = dev_get_platdata(&tc35876x_client->dev); + + if (pdata->gpio_panel_vadd != -1) { + gpio_set_value_cansleep(pdata->gpio_panel_vadd, 1); + msleep(260); + } + + if (cmi_lcd_i2c_client) { + int ret; + dev_dbg(&cmi_lcd_i2c_client->dev, "setting TCON\n"); + /* Bit 4 is average_saving. Setting it to 1, the brightness is + * referenced to the average of the frame content. 0 means + * reference to the maximum of frame contents. Bits 3:0 are + * allow_distort. When set to a nonzero value, all color values + * between 255-allow_distort*2 and 255 are mapped to the + * 255-allow_distort*2 value. 
+ */ + ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client, + PANEL_ALLOW_DISTORT, 0x10); + if (ret < 0) + dev_err(&cmi_lcd_i2c_client->dev, + "i2c write failed (%d)\n", ret); + ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client, + PANEL_BYPASS_PWMI, 0); + if (ret < 0) + dev_err(&cmi_lcd_i2c_client->dev, + "i2c write failed (%d)\n", ret); + /* Set minimum brightness value - this is tunable */ + ret = i2c_smbus_write_byte_data(cmi_lcd_i2c_client, + PANEL_PWM_MIN, 0x35); + if (ret < 0) + dev_err(&cmi_lcd_i2c_client->dev, + "i2c write failed (%d)\n", ret); + } + + if (pdata->gpio_panel_bl_en != -1) + gpio_set_value_cansleep(pdata->gpio_panel_bl_en, 1); + + tc35876x_brightness_control(dev, dev_priv->brightness_adjusted); +} + +static struct drm_display_mode *tc35876x_get_config_mode(struct drm_device *dev) +{ + struct drm_display_mode *mode; + + dev_dbg(&dev->pdev->dev, "%s\n", __func__); + + mode = kzalloc(sizeof(*mode), GFP_KERNEL); + if (!mode) + return NULL; + + /* FIXME: do this properly. 
*/ + mode->hdisplay = 1280; + mode->vdisplay = 800; + mode->hsync_start = 1360; + mode->hsync_end = 1400; + mode->htotal = 1440; + mode->vsync_start = 814; + mode->vsync_end = 824; + mode->vtotal = 838; + mode->clock = 33324 << 1; + + dev_info(&dev->pdev->dev, "hdisplay(w) = %d\n", mode->hdisplay); + dev_info(&dev->pdev->dev, "vdisplay(h) = %d\n", mode->vdisplay); + dev_info(&dev->pdev->dev, "HSS = %d\n", mode->hsync_start); + dev_info(&dev->pdev->dev, "HSE = %d\n", mode->hsync_end); + dev_info(&dev->pdev->dev, "htotal = %d\n", mode->htotal); + dev_info(&dev->pdev->dev, "VSS = %d\n", mode->vsync_start); + dev_info(&dev->pdev->dev, "VSE = %d\n", mode->vsync_end); + dev_info(&dev->pdev->dev, "vtotal = %d\n", mode->vtotal); + dev_info(&dev->pdev->dev, "clock = %d\n", mode->clock); + + drm_mode_set_name(mode); + drm_mode_set_crtcinfo(mode, 0); + + mode->type |= DRM_MODE_TYPE_PREFERRED; + + return mode; +} + +/* DV1 Active area 216.96 x 135.6 mm */ +#define DV1_PANEL_WIDTH 217 +#define DV1_PANEL_HEIGHT 136 + +static int tc35876x_get_panel_info(struct drm_device *dev, int pipe, + struct panel_info *pi) +{ + if (!dev || !pi) + return -EINVAL; + + pi->width_mm = DV1_PANEL_WIDTH; + pi->height_mm = DV1_PANEL_HEIGHT; + + return 0; +} + +static int tc35876x_bridge_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct tc35876x_platform_data *pdata; + + dev_info(&client->dev, "%s\n", __func__); + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(&client->dev, "%s: i2c_check_functionality() failed\n", + __func__); + return -ENODEV; + } + + pdata = dev_get_platdata(&client->dev); + if (!pdata) { + dev_err(&client->dev, "%s: no platform data\n", __func__); + return -ENODEV; + } + + if (pdata->gpio_bridge_reset != -1) { + gpio_request(pdata->gpio_bridge_reset, "tc35876x bridge reset"); + gpio_direction_output(pdata->gpio_bridge_reset, 0); + } + + if (pdata->gpio_panel_bl_en != -1) { + gpio_request(pdata->gpio_panel_bl_en, "tc35876x 
panel bl en"); + gpio_direction_output(pdata->gpio_panel_bl_en, 0); + } + + if (pdata->gpio_panel_vadd != -1) { + gpio_request(pdata->gpio_panel_vadd, "tc35876x panel vadd"); + gpio_direction_output(pdata->gpio_panel_vadd, 0); + } + + tc35876x_client = client; + + return 0; +} + +static int tc35876x_bridge_remove(struct i2c_client *client) +{ + struct tc35876x_platform_data *pdata = dev_get_platdata(&client->dev); + + dev_dbg(&client->dev, "%s\n", __func__); + + if (pdata->gpio_bridge_reset != -1) + gpio_free(pdata->gpio_bridge_reset); + + if (pdata->gpio_panel_bl_en != -1) + gpio_free(pdata->gpio_panel_bl_en); + + if (pdata->gpio_panel_vadd != -1) + gpio_free(pdata->gpio_panel_vadd); + + tc35876x_client = NULL; + + return 0; +} + +static const struct i2c_device_id tc35876x_bridge_id[] = { + { "i2c_disp_brig", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, tc35876x_bridge_id); + +static struct i2c_driver tc35876x_bridge_i2c_driver = { + .driver = { + .name = "i2c_disp_brig", + }, + .id_table = tc35876x_bridge_id, + .probe = tc35876x_bridge_probe, + .remove = __devexit_p(tc35876x_bridge_remove), +}; + +/* LCD panel I2C */ +static int cmi_lcd_i2c_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + dev_info(&client->dev, "%s\n", __func__); + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(&client->dev, "%s: i2c_check_functionality() failed\n", + __func__); + return -ENODEV; + } + + cmi_lcd_i2c_client = client; + + return 0; +} + +static int cmi_lcd_i2c_remove(struct i2c_client *client) +{ + dev_dbg(&client->dev, "%s\n", __func__); + + cmi_lcd_i2c_client = NULL; + + return 0; +} + +static const struct i2c_device_id cmi_lcd_i2c_id[] = { + { "cmi-lcd", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, cmi_lcd_i2c_id); + +static struct i2c_driver cmi_lcd_i2c_driver = { + .driver = { + .name = "cmi-lcd", + }, + .id_table = cmi_lcd_i2c_id, + .probe = cmi_lcd_i2c_probe, + .remove = __devexit_p(cmi_lcd_i2c_remove), +}; + +/* HACK to create I2C 
device while it's not created by platform code */ +#define CMI_LCD_I2C_ADAPTER 2 +#define CMI_LCD_I2C_ADDR 0x60 + +static int cmi_lcd_hack_create_device(void) +{ + struct i2c_adapter *adapter; + struct i2c_client *client; + struct i2c_board_info info = { + .type = "cmi-lcd", + .addr = CMI_LCD_I2C_ADDR, + }; + + pr_debug("%s\n", __func__); + + adapter = i2c_get_adapter(CMI_LCD_I2C_ADAPTER); + if (!adapter) { + pr_err("%s: i2c_get_adapter(%d) failed\n", __func__, + CMI_LCD_I2C_ADAPTER); + return -EINVAL; + } + + client = i2c_new_device(adapter, &info); + if (!client) { + pr_err("%s: i2c_new_device() failed\n", __func__); + i2c_put_adapter(adapter); + return -EINVAL; + } + + return 0; +} + +static const struct drm_encoder_helper_funcs tc35876x_encoder_helper_funcs = { + .dpms = mdfld_dsi_dpi_dpms, + .mode_fixup = mdfld_dsi_dpi_mode_fixup, + .prepare = mdfld_dsi_dpi_prepare, + .mode_set = mdfld_dsi_dpi_mode_set, + .commit = mdfld_dsi_dpi_commit, +}; + +static const struct drm_encoder_funcs tc35876x_encoder_funcs = { + .destroy = drm_encoder_cleanup, +}; + +const struct panel_funcs mdfld_tc35876x_funcs = { + .encoder_funcs = &tc35876x_encoder_funcs, + .encoder_helper_funcs = &tc35876x_encoder_helper_funcs, + .get_config_mode = tc35876x_get_config_mode, + .get_panel_info = tc35876x_get_panel_info, +}; + +void tc35876x_init(struct drm_device *dev) +{ + int r; + + dev_dbg(&dev->pdev->dev, "%s\n", __func__); + + cmi_lcd_hack_create_device(); + + r = i2c_add_driver(&cmi_lcd_i2c_driver); + if (r < 0) + dev_err(&dev->pdev->dev, + "%s: i2c_add_driver() for %s failed (%d)\n", + __func__, cmi_lcd_i2c_driver.driver.name, r); + + r = i2c_add_driver(&tc35876x_bridge_i2c_driver); + if (r < 0) + dev_err(&dev->pdev->dev, + "%s: i2c_add_driver() for %s failed (%d)\n", + __func__, tc35876x_bridge_i2c_driver.driver.name, r); + + tc35876x_brightness_init(dev); +} + +void tc35876x_exit(void) +{ + pr_debug("%s\n", __func__); + + i2c_del_driver(&tc35876x_bridge_i2c_driver); + + if 
(cmi_lcd_i2c_client) + i2c_del_driver(&cmi_lcd_i2c_driver); +} diff --git a/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h new file mode 100644 index 000000000000..b14b7f9e7d1e --- /dev/null +++ b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.h @@ -0,0 +1,38 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + */ + +#ifndef __MDFLD_DSI_LVDS_BRIDGE_H__ +#define __MDFLD_DSI_LVDS_BRIDGE_H__ + +void tc35876x_set_bridge_reset_state(struct drm_device *dev, int state); +void tc35876x_configure_lvds_bridge(struct drm_device *dev); +void tc35876x_brightness_control(struct drm_device *dev, int level); +void tc35876x_toshiba_bridge_panel_off(struct drm_device *dev); +void tc35876x_toshiba_bridge_panel_on(struct drm_device *dev); +void tc35876x_init(struct drm_device *dev); +void tc35876x_exit(void); + +extern const struct panel_funcs mdfld_tc35876x_funcs; + +#endif /*__MDFLD_DSI_LVDS_BRIDGE_H__*/ diff --git a/include/linux/i2c/tc35876x.h b/include/linux/i2c/tc35876x.h new file mode 100644 index 000000000000..cd6a51c71e7e --- /dev/null +++ b/include/linux/i2c/tc35876x.h @@ -0,0 +1,11 @@ + +#ifndef _TC35876X_H +#define _TC35876X_H + +struct tc35876x_platform_data { + int gpio_bridge_reset; + int gpio_panel_bl_en; + int gpio_panel_vadd; +}; + +#endif /* _TC35876X_H */ -- cgit v1.2.3 From 5fbd036b552f633abb394a319f7c62a5c86a9cd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2011 17:09:22 +0100 Subject: sched: Cleanup cpu_active madness Stepan found: CPU0 CPUn _cpu_up() __cpu_up() boostrap() notify_cpu_starting() set_cpu_online() while (!cpu_active()) cpu_relax() smp_call_function(.wait=1) /* we find cpu_online() is true */ arch_send_call_function_ipi_mask() /* wait-forever-more */ local_irq_enable() cpu_notify(CPU_ONLINE) sched_cpu_active() set_cpu_active() Now the purpose of cpu_active is mostly with bringing down a cpu, where we mark it !active to avoid the load-balancer from moving tasks to it while we tear down the cpu. This is required because we only update the sched_domain tree after we brought the cpu-down. And this is needed so that some tasks can still run while we bring it down, we just don't want new tasks to appear. 
On cpu-up, however, the sched_domain tree doesn't yet include the new cpu, so it's invisible to the load-balancer, regardless of the active state. So instead of setting the active state after we boot the new cpu (and consequently having to wait for it before enabling interrupts), set the cpu active before we set it online and avoid the whole mess. Reported-by: Stepan Moskovchenko Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1323965362.18942.71.camel@twins Signed-off-by: Ingo Molnar --- arch/arm/kernel/smp.c | 7 ------- arch/hexagon/kernel/smp.c | 2 -- arch/s390/kernel/smp.c | 6 ------ arch/x86/kernel/smpboot.c | 13 ------------- kernel/sched/core.c | 2 +- 5 files changed, 1 insertion(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index cdeb727527d3..d616ed51e7a7 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -295,13 +295,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void) */ percpu_timer_setup(); - while (!cpu_active(cpu)) - cpu_relax(); - - /* - * cpu_active bit is set, so it's safe to enalbe interrupts - * now.
- */ local_irq_enable(); local_fiq_enable(); diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index c871a2cffaef..0123c63e9a3a 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -179,8 +179,6 @@ void __cpuinit start_secondary(void) printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu); set_cpu_online(cpu, true); - while (!cpumask_test_cpu(cpu, cpu_active_mask)) - cpu_relax(); local_irq_enable(); cpu_idle(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2398ce6b15ae..b0e28c47ab83 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -550,12 +550,6 @@ int __cpuinit start_secondary(void *cpuvoid) S390_lowcore.restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; __ctl_set_bit(0, 28); /* Enable lowcore protection */ - /* - * Wait until the cpu which brought this one up marked it - * active before enabling interrupts. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); local_irq_enable(); /* cpu_idle will call schedule for us */ cpu_idle(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..58f78165d308 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); - /* - * Wait until the cpu which brought this one up marked it - * online before enabling interrupts. If we don't do that then - * we can end up waking up the softirq thread before this cpu - * reached the active state, which makes the scheduler unhappy - * and schedule the softirq thread on the wrong cpu. This is - * only observable with forced threaded interrupts, but in - * theory it could also happen w/o them. It's just way harder - * to achieve. 
- */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); - /* enable local interrupts */ local_irq_enable(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95545126be1c..b1ccce819ce2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5410,7 +5410,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: + case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit v1.2.3 From 87e24f4b67e68d9fd8df16e0bf9c66d1ad2a2533 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Mar 2012 23:59:25 +0100 Subject: perf/x86: Fix local vs remote memory events for NHM/WSM Verified using the below proglet.. before: [root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0 remote write Performance counter stats for './numa 0': 2,101,554 node-stores 2,096,931 node-store-misses 5.021546079 seconds time elapsed [root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1 local write Performance counter stats for './numa 1': 501,137 node-stores 199 node-store-misses 5.124451068 seconds time elapsed After: [root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 0 remote write Performance counter stats for './numa 0': 2,107,516 node-stores 2,097,187 node-store-misses 5.012755149 seconds time elapsed [root@westmere ~]# perf stat -e node-stores -e node-store-misses ./numa 1 local write Performance counter stats for './numa 1': 2,063,355 node-stores 165 node-store-misses 5.082091494 seconds time elapsed #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #define SIZE (32*1024*1024) volatile int done; void sig_done(int sig) { done = 1; } int main(int argc, char **argv) { cpu_set_t *mask, *mask2; size_t size; int i, err, t; int nrcpus = 1024; char *mem; unsigned long nodemask = 0x01; /* 
node 0 */ DIR *node; struct dirent *de; int read = 0; int local = 0; if (argc < 2) { printf("usage: %s [0-3]\n", argv[0]); printf(" bit0 - local/remote\n"); printf(" bit1 - read/write\n"); exit(0); } switch (atoi(argv[1])) { case 0: printf("remote write\n"); break; case 1: printf("local write\n"); local = 1; break; case 2: printf("remote read\n"); read = 1; break; case 3: printf("local read\n"); local = 1; read = 1; break; } mask = CPU_ALLOC(nrcpus); size = CPU_ALLOC_SIZE(nrcpus); CPU_ZERO_S(size, mask); node = opendir("/sys/devices/system/node/node0/"); if (!node) perror("opendir"); while ((de = readdir(node))) { int cpu; if (sscanf(de->d_name, "cpu%d", &cpu) == 1) CPU_SET_S(cpu, size, mask); } closedir(node); mask2 = CPU_ALLOC(nrcpus); CPU_ZERO_S(size, mask2); for (i = 0; i < size; i++) CPU_SET_S(i, size, mask2); CPU_XOR_S(size, mask2, mask2, mask); // invert if (!local) mask = mask2; err = sched_setaffinity(0, size, mask); if (err) perror("sched_setaffinity"); mem = mmap(0, SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); err = mbind(mem, SIZE, MPOL_BIND, &nodemask, 8*sizeof(nodemask), MPOL_MF_MOVE); if (err) perror("mbind"); signal(SIGALRM, sig_done); alarm(5); if (!read) { while (!done) { for (i = 0; i < SIZE; i++) mem[i] = 0x01; } } else { while (!done) { for (i = 0; i < SIZE; i++) t += *(volatile char *)(mem + i); } } return 0; } Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Link: http://lkml.kernel.org/n/tip-tq73sxus35xmqpojf7ootxgs@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3bd37bdf1b8e..61d4f79a550e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -385,14 +385,15 @@ static __initconst const u64 westmere_hw_cache_event_ids #define NHM_LOCAL_DRAM (1 << 
14) #define NHM_NON_DRAM (1 << 15) -#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) +#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_REMOTE (NHM_REMOTE_DRAM) #define NHM_DMND_READ (NHM_DMND_DATA_RD) #define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) #define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) #define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) -#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) #define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) static __initconst const u64 nehalem_hw_cache_extra_regs @@ -416,16 +417,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs }, [ C(NODE) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, - [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, - [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, - [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, }, }, }; -- cgit v1.2.3 From f9b4eeb809c6d031cc9561cc34dd691701cb2c2a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Mar 2012 12:44:35 +0100 Subject: perf/x86: Prettify pmu config literals I got somewhat tired of having to decode hex numbers.. 
Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Stephane Eranian Cc: Robert Richter Link: http://lkml.kernel.org/n/tip-0vsy1sgywc4uar3mu1szm0rg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 23 +++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 21 ++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 82db83b5c3bc..66fda0c26402 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -268,6 +268,29 @@ struct x86_pmu_quirk { void (*func)(void); }; +union x86_pmu_config { + struct { + u64 event:8, + umask:8, + usr:1, + os:1, + edge:1, + pc:1, + interrupt:1, + __reserved1:1, + en:1, + inv:1, + cmask:8, + event2:4, + __reserved2:4, + go:1, + ho:1; + } bits; + u64 value; +}; + +#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value + /* * struct x86_pmu - generic x86 pmu */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 61d4f79a550e..4bd9c9ef9d42 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1288,7 +1288,8 @@ static int intel_pmu_hw_config(struct perf_event *event) * * Thereby we gain a PEBS capable cycle counter. 
*/ - u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ + u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; @@ -1690,9 +1691,11 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_nehalem_extra_regs; /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); x86_add_quirk(intel_nehalem_quirk); @@ -1727,9 +1730,11 @@ __init int intel_pmu_init(void) x86_pmu.er_flags |= ERF_HAS_RSP_1; /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); pr_cont("Westmere events, "); break; @@ -1750,9 +1755,11 @@ __init int intel_pmu_init(void) x86_pmu.er_flags |= ERF_NO_HT_SHARING; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] 
= + X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); pr_cont("SandyBridge events, "); break; -- cgit v1.2.3 From 73d63d038ee9f769f5e5b46792d227fe20e442c5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 12 Mar 2012 11:36:33 -0700 Subject: x86/ioapic: Add register level checks to detect bogus io-apic entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the recent changes to clear_IO_APIC_pin(), which tries to clear the remoteIRR bit explicitly, some users started to see "Unable to reset IRR for apic .." messages. A closer look shows that these are related to bogus IO-APIC entries which return all 1's for their io-apic registers, so the above-mentioned error messages are benign. But the kernel should have ignored such io-apics in the first place. Check whether registers 0, 1 and 2 of the listed io-apic are all 1's and, if so, ignore that io-apic. Reported-by: Álvaro Castillo Tested-by: Jon Dufresne Signed-off-by: Suresh Siddha Cc: yinghai@kernel.org Cc: kernel-team@fedoraproject.org Cc: Josh Boyer Cc: Link: http://lkml.kernel.org/r/1331577393.31585.94.camel@sbsiddha-desk.sc.intel.com [ Performed minor cleanup of affected code.
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fb072754bc1d..6d10a66fc5a9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3967,18 +3967,36 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " - "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", + MAX_IO_APICS, nr_ioapics); return 1; } if (!address) { - printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); + pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n"); return 1; } return 0; } +static __init int bad_ioapic_register(int idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + + reg_00.raw = io_apic_read(idx, 0); + reg_01.raw = io_apic_read(idx, 1); + reg_02.raw = io_apic_read(idx, 2); + + if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { + pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", + mpc_ioapic_addr(idx)); + return 1; + } + + return 0; +} + void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; @@ -3995,6 +4013,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) ioapics[idx].mp_config.apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + + if (bad_ioapic_register(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return; + } + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); ioapics[idx].mp_config.apicver = io_apic_get_version(idx); @@ -4015,10 +4039,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 
gsi_base) if (gsi_cfg->gsi_end >= gsi_top) gsi_top = gsi_cfg->gsi_end + 1; - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mpc_ioapic_id(idx), - mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), - gsi_cfg->gsi_base, gsi_cfg->gsi_end); + pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", + idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); nr_ioapics++; } -- cgit v1.2.3 From 51e7dc7011c99e1e5294658c7b551b92ca069985 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 12 Mar 2012 14:55:55 +0530 Subject: x86: Rename trap_no to trap_nr in thread_struct There is precedent for the trap number being referred to as trap_nr. However, thread_struct refers to the trap number as trap_no. Change it to trap_nr. Also use enum values instead of left-over literals for trap values. This is pure cleanup, no functional change intended. Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120312092555.5379.942.sendpatchset@srdronam.in.ibm.com [ Fixed the math-emu build ] Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/ptrace.c | 3 ++- arch/x86/kernel/signal.c | 2 +- arch/x86/kernel/traps.c | 16 ++++++++-------- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/math-emu/fpu_entry.c | 5 +++-- arch/x86/mm/fault.c | 10 +++++----- 10 files changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index bc09ed2a8b97..45b4fdd4e1da 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -345,7
+345,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, put_user_ex(regs->dx, &sc->dx); put_user_ex(regs->cx, &sc->cx); put_user_ex(regs->ax, &sc->ax); - put_user_ex(current->thread.trap_no, &sc->trapno); + put_user_ex(current->thread.trap_nr, &sc->trapno); put_user_ex(current->thread.error_code, &sc->err); put_user_ex(regs->ip, &sc->ip); put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 02ce0b379647..f6d0d2eb0832 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -453,7 +453,7 @@ struct thread_struct { unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; - unsigned long trap_no; + unsigned long trap_nr; unsigned long error_code; /* floating point and extended processor state */ struct fpu fpu; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4025fe4f928f..28f98706b08b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,7 +265,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) #endif printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; show_registers(regs); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 93e7877a19c4..6fb330adc7c7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "tls.h" @@ -1425,7 +1426,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, int error_code, int si_code, struct siginfo *info) { - tsk->thread.trap_no = 1; + tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; memset(info, 0, sizeof(*info)); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c3846b6fb726..9c73acc1c860 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c 
@@ -150,7 +150,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->r15, &sc->r15); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_no, &sc->trapno); + put_user_ex(current->thread.trap_nr, &sc->trapno); put_user_ex(current->thread.error_code, &sc->err); put_user_ex(regs->ip, &sc->ip); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 037fc2bc5316..c6d17ad59b8a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -132,7 +132,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, trap_signal: #endif /* - * We want error_code and trap_no set for userspace faults and + * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. die() gives the * process no chance to handle the signal and notice the @@ -141,7 +141,7 @@ trap_signal: * delivered, faults. See also do_general_protection below. 
*/ tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; + tsk->thread.trap_nr = trapnr; #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && @@ -164,7 +164,7 @@ trap_signal: kernel_trap: if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; + tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } return; @@ -240,7 +240,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; - tsk->thread.trap_no = X86_TRAP_DF; + tsk->thread.trap_nr = X86_TRAP_DF; /* * This is always a kernel trap and never fixable (and thus must @@ -268,7 +268,7 @@ do_general_protection(struct pt_regs *regs, long error_code) goto gp_in_kernel; tsk->thread.error_code = error_code; - tsk->thread.trap_no = X86_TRAP_GP; + tsk->thread.trap_nr = X86_TRAP_GP; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { @@ -295,7 +295,7 @@ gp_in_kernel: return; tsk->thread.error_code = error_code; - tsk->thread.trap_no = X86_TRAP_GP; + tsk->thread.trap_nr = X86_TRAP_GP; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) return; @@ -475,7 +475,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; - task->thread.trap_no = trapnr; + task->thread.trap_nr = trapnr; die(str, regs, error_code); } return; @@ -485,7 +485,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) * Save the info for the exception handler and clear the error. 
*/ save_init_fpu(task); - task->thread.trap_no = trapnr; + task->thread.trap_nr = trapnr; task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab5ba15..a1315ab2d6b9 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -567,7 +567,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) } if (trapno != 1) return 1; /* we let this handle by the calling routine */ - current->thread.trap_no = trapno; + current->thread.trap_nr = trapno; current->thread.error_code = error_code; force_sig(SIGTRAP, current); return 0; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b07ba9393564..327509b95e0e 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -153,7 +153,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) thread->error_code = 6; /* user fault, no page, write */ thread->cr2 = ptr; - thread->trap_no = 14; + thread->trap_nr = X86_TRAP_PF; memset(&info, 0, sizeof(info)); info.si_signo = SIGSEGV; diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 7718541541d4..9b868124128d 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -269,7 +270,7 @@ void math_emulate(struct math_emu_info *info) FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. 
*/ RE_ENTRANT_CHECK_OFF; - current->thread.trap_no = 16; + current->thread.trap_nr = X86_TRAP_MF; current->thread.error_code = 0; send_sig(SIGFPE, current, 1); return; @@ -662,7 +663,7 @@ static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, void math_abort(struct math_emu_info *info, unsigned int signal) { FPU_EIP = FPU_ORIG_EIP; - current->thread.trap_no = 16; + current->thread.trap_nr = X86_TRAP_MF; current->thread.error_code = 0; send_sig(signal, current, 1); RE_ENTRANT_CHECK_OFF; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f0b4caf85c1a..3ecfd1aaf214 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -615,7 +615,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) @@ -636,7 +636,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* Are we prepared to handle this kernel fault? 
*/ if (fixup_exception(regs)) { if (current_thread_info()->sig_on_uaccess_error && signal) { - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; tsk->thread.cr2 = address; @@ -676,7 +676,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code; sig = SIGKILL; @@ -754,7 +754,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, /* Kernel addresses are always protection faults: */ tsk->thread.cr2 = address; tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); @@ -838,7 +838,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.cr2 = address; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { -- cgit v1.2.3 From 9993bc635d01a6ee7f6b833b4ee65ce7c06350b1 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Fri, 9 Mar 2012 16:41:01 -0800 Subject: sched/x86: Fix overflow in cyc2ns_offset When a machine boots up, the TSC generally gets reset. However, when kexec is used to boot into a kernel, the TSC value would be carried over from the previous kernel. The computation of cycns_offset in set_cyc2ns_scale is prone to an overflow, if the machine has been up more than 208 days prior to the kexec. The overflow happens when we multiply *scale, even though there is enough room to store the final answer. We fix this issue by decomposing tsc_now into the quotient and remainder of division by CYC2NS_SCALE_FACTOR and then performing the multiplication separately on the two components. 
Refactor code to share the calculation with the previous fix in __cycles_2_ns(). Signed-off-by: Salman Qazi Acked-by: John Stultz Acked-by: Peter Zijlstra Cc: Paul Turner Cc: john stultz Link: http://lkml.kernel.org/r/20120310004027.19291.88460.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 8 ++------ arch/x86/kernel/tsc.c | 3 ++- include/linux/kernel.h | 13 +++++++++++++ 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 431793e5d484..34baa0eb5d0c 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -57,14 +57,10 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - unsigned long long quot; - unsigned long long rem; int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - quot = (cyc >> CYC2NS_SCALE_FACTOR); - rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1); - ns += quot * per_cpu(cyc2ns, cpu) + - ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR); + ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), + (1UL << CYC2NS_SCALE_FACTOR)); return ns; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..183c5925a9fe 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + *offset = ns_now - mult_frac(tsc_now, *scale, + (1UL << CYC2NS_SCALE_FACTOR)); } sched_clock_idle_wakeup_event(0); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e8343422240a..d801acb5e680 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -85,6 +85,19 @@ } \ ) +/* + * Multiplies an integer by a fraction, while avoiding unnecessary + * overflow 
or loss of precision. + */ +#define mult_frac(x, numer, denom)( \ +{ \ + typeof(x) quot = (x) / (denom); \ + typeof(x) rem = (x) % (denom); \ + (quot * (numer)) + ((rem * (numer)) / (denom)); \ +} \ +) + + #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) -- cgit v1.2.3 From ef334a20d84f52407a8a2afd02ddeaecbef0ad3d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 13 Mar 2012 19:33:03 +0530 Subject: x86: Move is_ia32_task to asm/thread_info.h from asm/compat.h is_ia32_task() is useful even in !CONFIG_COMPAT cases - utrace will use it for example. Hence move it to a more generic file: asm/thread_info.h Also now is_ia32_task() returns true if CONFIG_X86_32 is defined. Signed-off-by: Srikar Dronamraju Acked-by: H. Peter Anvin Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120313140303.17134.1401.sendpatchset@srdronam.in.ibm.com [ Performed minor cleanup ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/compat.h | 9 --------- arch/x86/include/asm/thread_info.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 355edc091604..d6805798d6fc 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -235,15 +235,6 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)round_down(sp - len, 16); } -static inline bool is_ia32_task(void) -{ -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & TS_COMPAT) - return true; -#endif - return false; -} - static inline bool is_x32_task(void) { #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h index af1db7e722f4..ad6df8ccd715 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -266,6 +266,18 @@ static inline void set_restore_sigmask(void) ti->status |= TS_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); } + +static inline bool is_ia32_task(void) +{ +#ifdef CONFIG_X86_32 + return true; +#endif +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->status & TS_COMPAT) + return true; +#endif + return false; +} #endif /* !__ASSEMBLY__ */ #ifndef __ASSEMBLY__ -- cgit v1.2.3 From 09f98a825a821f7a3f1b162f9ed023f37213a63b Mon Sep 17 00:00:00 2001 From: Tang Liang Date: Fri, 9 Dec 2011 10:05:54 +0800 Subject: x86, acpi, tboot: Have a ACPI os prepare sleep instead of calling tboot_sleep. The ACPI suspend path makes a call to tboot_sleep right before it writes the PM1A, PM1B values. We replace the direct call to tboot via an registration callback similar to __acpi_register_gsi. CC: Len Brown Acked-by: Joseph Cihula Acked-by: Rafael J. 
Wysocki [v1: Added __attribute__ ((unused))] [v2: Introduced a wrapper instead of changing tboot_sleep return values] [v3: Added return value AE_CTRL_SKIP for acpi_os_sleep_prepare] Signed-off-by: Tang Liang [v1: Fix compile issues on IA64 and PPC64] [v2: Fix where __acpi_os_prepare_sleep==NULL and did not go in sleep properly] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/tboot.c | 8 ++++++++ drivers/acpi/acpica/hwsleep.c | 10 +++++++--- drivers/acpi/osl.c | 24 ++++++++++++++++++++++++ include/acpi/acexcep.h | 1 + include/linux/acpi.h | 10 ++++++++++ include/linux/tboot.h | 1 - 6 files changed, 50 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e2410e27f97e..1a4ab7df5b63 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -297,6 +297,12 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) tboot_shutdown(acpi_shutdown_map[sleep_state]); } +static int tboot_sleep_wrapper(u8 sleep_state, u32 pm1a_control, + u32 pm1b_control) +{ + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + return 0; +} static atomic_t ap_wfs_count; @@ -345,6 +351,8 @@ static __init int tboot_late_init(void) atomic_set(&ap_wfs_count, 0); register_hotcpu_notifier(&tboot_cpu_notifier); + + acpi_os_set_prepare_sleep(&tboot_sleep_wrapper); return 0; } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index d52da3073650..992359af7e2f 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -43,9 +43,9 @@ */ #include +#include #include "accommon.h" #include "actables.h" -#include #include #define _COMPONENT ACPI_HARDWARE @@ -344,8 +344,12 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) ACPI_FLUSH_CPU_CACHE(); - tboot_sleep(sleep_state, pm1a_control, pm1b_control); - + status = acpi_os_prepare_sleep(sleep_state, pm1a_control, + pm1b_control); + if (ACPI_SKIP(status)) + return_ACPI_STATUS(AE_OK); + if 
(ACPI_FAILURE(status)) + return_ACPI_STATUS(status); /* Write #2: Write both SLP_TYP + SLP_EN */ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index f31c5c5f1b7e..f3aae4ba507e 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -76,6 +76,9 @@ EXPORT_SYMBOL(acpi_in_debugger); extern char line_buf[80]; #endif /*ENABLE_DEBUGGER */ +static int (*__acpi_os_prepare_sleep)(u8 sleep_state, u32 pm1a_ctrl, + u32 pm1b_ctrl); + static acpi_osd_handler acpi_irq_handler; static void *acpi_irq_context; static struct workqueue_struct *kacpid_wq; @@ -1659,3 +1662,24 @@ acpi_status acpi_os_terminate(void) return AE_OK; } + +acpi_status acpi_os_prepare_sleep(u8 sleep_state, u32 pm1a_control, + u32 pm1b_control) +{ + int rc = 0; + if (__acpi_os_prepare_sleep) + rc = __acpi_os_prepare_sleep(sleep_state, + pm1a_control, pm1b_control); + if (rc < 0) + return AE_ERROR; + else if (rc > 0) + return AE_CTRL_SKIP; + + return AE_OK; +} + +void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, + u32 pm1a_ctrl, u32 pm1b_ctrl)) +{ + __acpi_os_prepare_sleep = func; +} diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 5b6c391efc8e..fa0d22ce089e 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -57,6 +57,7 @@ #define ACPI_SUCCESS(a) (!(a)) #define ACPI_FAILURE(a) (a) +#define ACPI_SKIP(a) (a == AE_CTRL_SKIP) #define AE_OK (acpi_status) 0x0000 /* diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6001b4da39dd..fccd017b8b6e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -359,4 +359,14 @@ static inline int suspend_nvs_register(unsigned long a, unsigned long b) } #endif +#ifdef CONFIG_ACPI +void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, + u32 pm1a_ctrl, u32 pm1b_ctrl)); + +acpi_status acpi_os_prepare_sleep(u8 sleep_state, + u32 pm1a_control, u32 pm1b_control); +#else +#define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0) 
+#endif + #endif /*_LINUX_ACPI_H*/ diff --git a/include/linux/tboot.h b/include/linux/tboot.h index 1dba6ee55203..c75128bed5fa 100644 --- a/include/linux/tboot.h +++ b/include/linux/tboot.h @@ -143,7 +143,6 @@ static inline int tboot_enabled(void) extern void tboot_probe(void); extern void tboot_shutdown(u32 shutdown_type); -extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); extern struct acpi_table_header *tboot_get_dmar_table( struct acpi_table_header *dmar_tbl); extern int tboot_force_iommu(void); -- cgit v1.2.3 From a1f37788a6d8c037e7d92fe4a0fe9ec0d713b21e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 8 Dec 2011 17:14:08 +0800 Subject: tboot: Add return values for tboot_sleep .. as appropriate. As tboot_sleep now returns values, remove tboot_sleep_wrapper. Suggested-and-Acked-by: Rafael J. Wysocki Acked-by: Joseph Cihula [v1: Return -1/0/+1 instead of ACPI_xx values] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/tboot.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 1a4ab7df5b63..6410744ac5cb 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -272,7 +272,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) offsetof(struct acpi_table_facs, firmware_waking_vector); } -void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) { static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { /* S0,1,2: */ -1, -1, -1, @@ -281,7 +281,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) /* S5: */ TB_SHUTDOWN_S5 }; if (!tboot_enabled()) - return; + return 0; tboot_copy_fadt(&acpi_gbl_FADT); tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; @@ -292,15 +292,10 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) { pr_warning("unsupported sleep state 0x%x\n", sleep_state); - return; + return -1; } tboot_shutdown(acpi_shutdown_map[sleep_state]); -} -static int tboot_sleep_wrapper(u8 sleep_state, u32 pm1a_control, - u32 pm1b_control) -{ - tboot_sleep(sleep_state, pm1a_control, pm1b_control); return 0; } @@ -352,7 +347,7 @@ static __init int tboot_late_init(void) atomic_set(&ap_wfs_count, 0); register_hotcpu_notifier(&tboot_cpu_notifier); - acpi_os_set_prepare_sleep(&tboot_sleep_wrapper); + acpi_os_set_prepare_sleep(&tboot_sleep); return 0; } -- cgit v1.2.3 From bb6fa8b275e132b1e9319dbab94211543a0b7bd3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Mar 2012 22:44:41 -0700 Subject: x32: Fix stupid ia32/x32 inversion in the siginfo format Fix a stray ! which flipped the sense if we were generating a signal frame for ia32 vs. x32. Introduced in: e7084fd5 x32: Switch to a 64-bit clock_t Reported-by: H. J. Lu Signed-off-by: H. Peter Anvin Cc: Gregory M. Lueck Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/ia32/ia32_signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index bc09ed2a8b97..ef026aa19d63 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -37,7 +37,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err = 0; - bool ia32 = !is_ia32_task(); + bool ia32 = is_ia32_task(); if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; -- cgit v1.2.3 From fa63030e9c79e37b4d4e63b39ffb09cfb7aa0fe4 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Wed, 14 Mar 2012 15:17:34 +0800 Subject: x86/platform: Move APIC ID validity check into platform APIC code Move APIC ID validity check into platform APIC code, so it can be overridden when needed. 
For NumaChip systems, always trust MADT, as it's constructed with high APIC IDs. Behaviour verifies on standard x86 systems and on NumaChip systems with this, and compile-tested with allyesconfig. Signed-off-by: Daniel J Blueman Reviewed-by: Steffen Persvold Cc: Yinghai Lu Cc: H. Peter Anvin Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1331709454-27966-1-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 6 ++++++ arch/x86/kernel/apic/apic_flat_64.c | 2 ++ arch/x86/kernel/apic/apic_noop.c | 1 + arch/x86/kernel/apic/apic_numachip.c | 10 +++++++++- arch/x86/kernel/apic/bigsmp_32.c | 1 + arch/x86/kernel/apic/es7000_32.c | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 + arch/x86/kernel/apic/probe_32.c | 1 + arch/x86/kernel/apic/summit_32.c | 1 + arch/x86/kernel/apic/x2apic_cluster.c | 1 + arch/x86/kernel/apic/x2apic_phys.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/smpboot.c | 2 +- 13 files changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3ab9bdd87e79..a9371c91718c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -288,6 +288,7 @@ struct apic { int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); + int (*apic_id_valid)(int apicid); int (*apic_id_registered)(void); u32 irq_delivery_mode; @@ -532,6 +533,11 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } +static inline int default_apic_id_valid(int apicid) +{ + return x2apic_mode || (apicid < 255); +} + extern void default_setup_apic_routing(void); extern struct apic apic_noop; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8c3cdded6f2b..359b6899a36c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -180,6 +180,7 @@ static struct apic apic_flat = { .name = "flat", .probe = flat_probe, 
.acpi_madt_oem_check = flat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -337,6 +338,7 @@ static struct apic apic_physflat = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 775b82bc655c..634ae6cdd5c9 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -124,6 +124,7 @@ struct apic apic_noop = { .probe = noop_probe, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 09d3d8c1cd99..d9ea5f331ac5 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void) return get_apic_id(apic_read(APIC_ID)); } +static int numachip_apic_id_valid(int apicid) +{ + /* Trust what bootloader passes in MADT */ + return 1; +} + static int numachip_apic_id_registered(void) { return physid_isset(read_xapic_id(), phys_cpu_present_map); @@ -223,10 +229,11 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; + setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } @@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = { .name = "NumaConnect system", .probe = numachip_probe, .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_valid = 
numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 521bead01137..0cdec7065aff 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -198,6 +198,7 @@ static struct apic apic_bigsmp = { .name = "bigsmp", .probe = probe_bigsmp, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 5d513bc47b6b..e42d1d3b9134 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index c4a61ca1349a..00d2422ca7c9 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = numaq_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0787bb3412f4..ff2c1b9aac4d 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -92,6 
+92,7 @@ static struct apic apic_default = { .name = "default", .probe = probe_default, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 19114423c58c..fea000b27f07 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -496,6 +496,7 @@ static struct apic apic_summit = { .name = "summit", .probe = probe_summit, .acpi_madt_oem_check = summit_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = summit_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 500795875827..9193713060a9 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f5373dfde21e..bcd1db6eaca9 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 79b05b88aa19..fc4771425852 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -351,6 +351,7 @@ static struct apic __refdata 
apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..d279e6e1d1b7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -847,7 +847,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || - (!x2apic_mode && apicid >= 255)) { + !apic->apic_id_valid(apicid)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } -- cgit v1.2.3 From 0b95ec56ae19f61ca664e83766a2180057f0e351 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 5 Mar 2012 20:26:47 +0200 Subject: crypto: camellia - add assembler implementation for x86_64 Patch adds x86_64 assembler implementation of Camellia block cipher. Two set of functions are provided. First set is regular 'one-block at time' encrypt/decrypt functions. Second is 'two-block at time' functions that gain performance increase on out-of-order CPUs. Performance of 2-way functions should be equal to 1-way functions with in-order CPUs. Patch has been tested with tcrypt and automated filesystem tests. 
Tcrypt benchmark results: AMD Phenom II 1055T (fam:16, model:10): camellia-asm vs camellia_generic: 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.27x 1.22x 1.30x 1.42x 1.30x 1.34x 1.19x 1.05x 1.23x 1.24x 64B 1.74x 1.79x 1.43x 1.87x 1.81x 1.87x 1.48x 1.38x 1.55x 1.62x 256B 1.90x 1.87x 1.43x 1.94x 1.94x 1.95x 1.63x 1.62x 1.67x 1.70x 1024B 1.96x 1.93x 1.43x 1.95x 1.98x 2.01x 1.67x 1.69x 1.74x 1.80x 8192B 1.96x 1.96x 1.39x 1.93x 2.01x 2.03x 1.72x 1.64x 1.71x 1.76x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.23x 1.23x 1.33x 1.39x 1.34x 1.38x 1.04x 1.18x 1.21x 1.29x 64B 1.72x 1.69x 1.42x 1.78x 1.81x 1.89x 1.57x 1.52x 1.56x 1.65x 256B 1.85x 1.88x 1.42x 1.86x 1.93x 1.96x 1.69x 1.65x 1.70x 1.75x 1024B 1.88x 1.86x 1.45x 1.95x 1.96x 1.95x 1.77x 1.71x 1.77x 1.78x 8192B 1.91x 1.86x 1.42x 1.91x 2.03x 1.98x 1.73x 1.71x 1.78x 1.76x camellia-asm vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.15x 1.22x ecb-dec 1.16x 1.16x cbc-enc 0.85x 0.90x cbc-dec 1.20x 1.23x ctr-enc 1.28x 1.30x ctr-dec 1.27x 1.28x lrw-enc 1.12x 1.16x lrw-dec 1.08x 1.10x xts-enc 1.11x 1.15x xts-dec 1.14x 1.15x Intel Core2 T8100 (fam:6, model:23, step:6): camellia-asm vs camellia_generic: 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.10x 1.12x 1.14x 1.16x 1.16x 1.15x 1.02x 1.02x 1.08x 1.08x 64B 1.61x 1.60x 1.17x 1.68x 1.67x 1.66x 1.43x 1.42x 1.44x 1.42x 256B 1.65x 1.73x 1.17x 1.77x 1.81x 1.80x 1.54x 1.53x 1.58x 1.54x 1024B 1.76x 1.74x 1.18x 1.80x 1.85x 1.85x 1.60x 1.59x 1.65x 1.60x 8192B 1.77x 1.75x 1.19x 1.81x 1.85x 1.86x 1.63x 1.61x 1.66x 1.62x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.10x 1.07x 1.13x 1.16x 1.11x 1.16x 1.03x 1.02x 1.08x 1.07x 64B 1.61x 1.62x 1.15x 1.66x 1.63x 
1.68x 1.47x 1.46x 1.47x 1.44x 256B 1.71x 1.70x 1.16x 1.75x 1.69x 1.79x 1.58x 1.57x 1.59x 1.55x 1024B 1.78x 1.72x 1.17x 1.75x 1.80x 1.80x 1.63x 1.62x 1.65x 1.62x 8192B 1.76x 1.73x 1.17x 1.78x 1.80x 1.81x 1.64x 1.62x 1.68x 1.64x camellia-asm vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.17x 1.21x ecb-dec 1.17x 1.20x cbc-enc 0.80x 0.82x cbc-dec 1.22x 1.24x ctr-enc 1.25x 1.26x ctr-dec 1.25x 1.26x lrw-enc 1.14x 1.18x lrw-dec 1.13x 1.17x xts-enc 1.14x 1.18x xts-dec 1.14x 1.17x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/camellia-x86_64-asm_64.S | 520 ++++++++ arch/x86/crypto/camellia_glue.c | 1952 ++++++++++++++++++++++++++++++ crypto/Kconfig | 18 + 4 files changed, 2492 insertions(+) create mode 100644 arch/x86/crypto/camellia-x86_64-asm_64.S create mode 100644 arch/x86/crypto/camellia_glue.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2b0b9631474b..e191ac048b59 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o +obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o @@ -25,6 +26,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o +camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S 
b/arch/x86/crypto/camellia-x86_64-asm_64.S new file mode 100644 index 000000000000..0b3374335fdc --- /dev/null +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -0,0 +1,520 @@ +/* + * Camellia Cipher Algorithm (x86_64) + * + * Copyright (C) 2012 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ +/* Provides four entry points: __camellia_enc_blk and camellia_dec_blk process one 16-byte block per call, __camellia_enc_blk_2way and camellia_dec_blk_2way process two blocks (32 bytes) per call. */ +.file "camellia-x86_64-asm_64.S" +.text +/* Tables of 64-bit entries supplied from outside this file (camellia_glue.c defines them as const u64[256]); the lookups below index them with scale 8. */ +.extern camellia_sp10011110; +.extern camellia_sp22000222; +.extern camellia_sp03303033; +.extern camellia_sp00444404; +.extern camellia_sp02220222; +.extern camellia_sp30333033; +.extern camellia_sp44044404; +.extern camellia_sp11101110; + +#define sp10011110 camellia_sp10011110 +#define sp22000222 camellia_sp22000222 +#define sp03303033 camellia_sp03303033 +#define sp00444404 camellia_sp00444404 +#define sp02220222 camellia_sp02220222 +#define sp30333033 camellia_sp30333033 +#define sp44044404 camellia_sp44044404 +#define sp11101110 camellia_sp11101110 + +#define CAMELLIA_TABLE_BYTE_LEN 272 +/* The two offsets below mirror struct camellia_ctx in camellia_glue.c: a 272-byte key_table at offset 0, followed by the 32-bit key_length. */ +/* struct camellia_ctx: */ +#define key_table 0 +#define key_length CAMELLIA_TABLE_BYTE_LEN + +/* register macros */ +#define CTX %rdi +#define RIO %rsi +#define RIOd %esi +/* Block state: RAB0 and RCD0 hold the two 64-bit halves of block 0; RAB1 and RCD1 hold those of block 1 and are only used by the 2-way code. */ +#define RAB0 %rax +#define RCD0 %rcx +#define RAB1 %rbx +#define RCD1 %rdx + +#define RAB0d %eax +#define RCD0d %ecx +#define RAB1d %ebx +#define RCD1d %edx + +#define 
RAB0bl %al +#define RCD0bl %cl +#define RAB1bl %bl +#define RCD1bl %dl + +#define RAB0bh %ah +#define RCD0bh %ch +#define RAB1bh %bh +#define RCD1bh %dh +/* Temporaries. RT0 is the same register as RIO (%rsi) and RT1 is %rbp: the functions below keep the incoming %rbp in RRBP and reload RIO from RDST before writing any output. */ +#define RT0 %rsi +#define RT1 %rbp +#define RT2 %r8 + +#define RT0d %esi +#define RT1d %ebp +#define RT2d %r8d + +#define RT2bl %r8b +/* RXOR carries the xor flag in the encrypt functions (the decrypt functions use it as scratch and, in the 2-way case, to hold %rbx); RRBP holds the saved %rbp; RDST holds the dst pointer. */ +#define RXOR %r9 +#define RRBP %r10 +#define RDST %r11 + +#define RXORd %r9d +#define RXORbl %r9b +/* xor2ror16: XOR two table entries into dst. T0 is indexed by the lowest byte of ab (ab##bl) and T1 by the second-lowest byte (ab##bh); ab is then rotated right by 16 bits so the next invocation sees the following two bytes. tmp1 and tmp2 are clobbered. */ +#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \ + movzbl ab ## bl, tmp2 ## d; \ + movzbl ab ## bh, tmp1 ## d; \ + rorq $16, ab; \ + xorq T0(, tmp2, 8), dst; \ + xorq T1(, tmp1, 8), dst; + +/********************************************************************** + 1-way camellia + **********************************************************************/ +#define roundsm(ab, subkey, cd) \ + movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \ + /* One round: RT2 starts as 64-bit key_table entry number subkey; the eight table entries selected by the bytes of ab are XORed alternately into cd and RT2, and RT2 is folded into cd at the end. Four 16-bit rotates leave ab with its original value. Clobbers RT0, RT1, RT2. */ \ + xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \ + xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \ + xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \ + xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \ + \ + xorq RT2, cd ## 0; +/* fls: with hi/lo meaning the upper/lower 32 bits of a register or key_table entry, performs in this order (each step sees the earlier updates): l.hi ^= rol32(l.lo & kl.lo, 1); r.lo ^= r.hi | kr.hi; l.lo ^= l.hi | kl.hi; r.hi ^= rol32(r.lo & kr.lo, 1). kl and kr are key_table entry numbers. Clobbers RT0, RT1, RT2. */ +#define fls(l, r, kl, kr) \ + movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \ + andl l ## 0d, RT0d; \ + roll $1, RT0d; \ + shlq $32, RT0; \ + xorq RT0, l ## 0; \ + movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \ + orq r ## 0, RT1; \ + shrq $32, RT1; \ + xorq RT1, r ## 0; \ + /* second half: the or-step for l and the and/rotate-step for r */ \ + movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \ + orq l ## 0, RT2; \ + shrq $32, RT2; \ + xorq RT2, l ## 0; \ + movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \ + andl r ## 0d, RT0d; \ + roll $1, RT0d; \ + shlq $32, RT0; \ + xorq RT0, r ## 0; +/* Six rounds with key_table entries i+2 through i+7, alternating which half is updated. */ +#define enc_rounds(i) \ + roundsm(RAB, i + 2, RCD); \ + roundsm(RCD, i + 3, RAB); \ + roundsm(RAB, i + 4, RCD); \ + roundsm(RCD, i + 5, RAB); \ + roundsm(RAB, i + 6, RCD); \ + roundsm(RCD, i + 7, RAB); +/* fls with entry i applied to the RAB half and entry i+1 to the RCD half (dec_fls uses them the other way round). */ +#define enc_fls(i) \ + fls(RAB, RCD, i + 0, i + 1); +/* Input side of encryption: load the 16-byte block at (RIO) as two 64-bit halves, byte-swap each and rotate it by 32 bits, then XOR key_table entry 0 into RAB0. */ +#define enc_inpack() \ + movq (RIO), RAB0; \ + bswapq RAB0; \ + rolq $32, RAB0; \ + movq 
4*2(RIO), RCD0; \ + bswapq RCD0; \ + rorq $32, RCD0; \ + xorq key_table(CTX), RAB0; +/* Output side of encryption: XOR key_table entry number max into RCD0, undo the rotate and byte swap, then write RCD0 to bytes 0..7 and RAB0 to bytes 8..15 of (RIO). op is mov for a plain store or xor to combine with what is already there. */ +#define enc_outunpack(op, max) \ + xorq key_table(CTX, max, 8), RCD0; \ + rorq $32, RCD0; \ + bswapq RCD0; \ + op ## q RCD0, (RIO); \ + rolq $32, RAB0; \ + bswapq RAB0; \ + op ## q RAB0, 4*2(RIO); +/* The same six rounds as enc_rounds with the key_table entries taken in reverse order (i+7 down to i+2). */ +#define dec_rounds(i) \ + roundsm(RAB, i + 7, RCD); \ + roundsm(RCD, i + 6, RAB); \ + roundsm(RAB, i + 5, RCD); \ + roundsm(RCD, i + 4, RAB); \ + roundsm(RAB, i + 3, RCD); \ + roundsm(RCD, i + 2, RAB); + +#define dec_fls(i) \ + fls(RAB, RCD, i + 1, i + 0); +/* Like enc_inpack, except that key_table entry number max (instead of entry 0) is XORed into RAB0. */ +#define dec_inpack(max) \ + movq (RIO), RAB0; \ + bswapq RAB0; \ + rolq $32, RAB0; \ + movq 4*2(RIO), RCD0; \ + bswapq RCD0; \ + rorq $32, RCD0; \ + xorq key_table(CTX, max, 8), RAB0; +/* Like enc_outunpack with key_table entry 0 and plain stores only. */ +#define dec_outunpack() \ + xorq key_table(CTX), RCD0; \ + rorq $32, RCD0; \ + bswapq RCD0; \ + movq RCD0, (RIO); \ + rolq $32, RAB0; \ + bswapq RAB0; \ + movq RAB0, 4*2(RIO); +/* __camellia_enc_blk: encrypt the 16-byte block at src and either store it to dst (xor == 0) or XOR it into dst (xor != 0). */ +.global __camellia_enc_blk; +.type __camellia_enc_blk,@function; + +__camellia_enc_blk: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool xor + */ + movq %rbp, RRBP; + /* %rcx and %rsi are about to be reused as RCD0 and RIO/RT0, so move the flag and dst out of the way; RIO then points at src */ + movq %rcx, RXOR; + movq %rsi, RDST; + movq %rdx, RIO; + + enc_inpack(); + + enc_rounds(0); + enc_fls(8); + enc_rounds(8); + enc_fls(16); + enc_rounds(16); + movl $24, RT1d; /* max */ + /* max is loaded only after a round group because the rounds clobber RT1. key_length == 16 finishes here with max = 24; any other length runs one more fls and round group and uses max = 32. cmpb looks at the lowest byte of the 32-bit key_length only. */ + cmpb $16, key_length(CTX); + je __enc_done; + + enc_fls(24); + enc_rounds(24); + movl $32, RT1d; /* max */ + /* RIO was overwritten while serving as RT0: point it at dst again, then store or xor depending on the flag tested here */ +__enc_done: + testb RXORbl, RXORbl; + movq RDST, RIO; + + jnz __enc_xor; + + enc_outunpack(mov, RT1); + + movq RRBP, %rbp; + ret; + +__enc_xor: + enc_outunpack(xor, RT1); + + movq RRBP, %rbp; + ret; +/* camellia_dec_blk: decrypt the 16-byte block at src and store the result to dst. */ +.global camellia_dec_blk; +.type camellia_dec_blk,@function; + +camellia_dec_blk: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + cmpl $16, key_length(CTX); + movl $32, RT2d; + movl $24, RXORd; + cmovel RXORd, RT2d; /* max */ + /* max = 24 when key_length == 16, otherwise 32; RXOR is plain scratch here because there is no xor argument */ + movq %rbp, RRBP; + movq %rsi, RDST; + movq %rdx, RIO; + + dec_inpack(RT2); + /* with max == 24 the round group for entries 24 and up is skipped; RT2 must be tested now, before the rounds overwrite it */ + cmpb $24, RT2bl; + je __dec_rounds16; + + dec_rounds(24); + 
dec_fls(24); + +__dec_rounds16: + dec_rounds(16); + dec_fls(16); + dec_rounds(8); + dec_fls(8); + dec_rounds(0); + /* RIO served as RT0 during the rounds; restore it to dst for the output */ + movq RDST, RIO; + + dec_outunpack(); + + movq RRBP, %rbp; + ret; + +/********************************************************************** + 2-way camellia + **********************************************************************/ +#define roundsm2(ab, subkey, cd) \ + movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \ + xorq RT2, cd ## 1; \ + /* Round for two blocks. Block 1 received the key_table entry directly above; block 0 is handled exactly as in roundsm, with RT2 as its second accumulator. */ \ + xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \ + xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \ + xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \ + xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \ + /* block 1: every lookup goes straight into cd1; the pending fold of RT2 into cd0 sits between the first two lookups */ \ + xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \ + xorq RT2, cd ## 0; \ + xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \ + xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \ + xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1); +/* fls2: the operations of fls applied to both blocks. The first half (and/rotate on l, or on r) runs for block 0 and then block 1, followed by the second half (or on l, and/rotate on r) for block 0 and then block 1. RT0, RT1 and RT2 take turns as temporaries. */ +#define fls2(l, r, kl, kr) \ + movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \ + andl l ## 0d, RT0d; \ + roll $1, RT0d; \ + shlq $32, RT0; \ + xorq RT0, l ## 0; \ + movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \ + orq r ## 0, RT1; \ + shrq $32, RT1; \ + xorq RT1, r ## 0; \ + /* first half, block 1 */ \ + movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \ + andl l ## 1d, RT2d; \ + roll $1, RT2d; \ + shlq $32, RT2; \ + xorq RT2, l ## 1; \ + movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \ + orq r ## 1, RT0; \ + shrq $32, RT0; \ + xorq RT0, r ## 1; \ + /* second half, block 0 */ \ + movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \ + orq l ## 0, RT1; \ + shrq $32, RT1; \ + xorq RT1, l ## 0; \ + movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \ + andl r ## 0d, RT2d; \ + roll $1, RT2d; \ + shlq $32, RT2; \ + xorq RT2, r ## 0; \ + /* second half, block 1 */ \ + movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \ + orq l ## 1, RT0; \ + shrq $32, RT0; \ + xorq RT0, l ## 1; \ + movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \ + andl r ## 1d, RT1d; \ + roll $1, RT1d; \ 
+ shlq $32, RT1; \ + xorq RT1, r ## 1; + +#define enc_rounds2(i) \ + roundsm2(RAB, i + 2, RCD); \ + roundsm2(RCD, i + 3, RAB); \ + roundsm2(RAB, i + 4, RCD); \ + roundsm2(RCD, i + 5, RAB); \ + roundsm2(RAB, i + 6, RCD); \ + roundsm2(RCD, i + 7, RAB); + +#define enc_fls2(i) \ + fls2(RAB, RCD, i + 0, i + 1); +/* Two-block enc_inpack: block 0 comes from bytes 0..15 and block 1 from bytes 16..31 of (RIO). ror and rol are swapped compared with the 1-way macro, which makes no difference for a 32-bit rotate of a 64-bit register. */ +#define enc_inpack2() \ + movq (RIO), RAB0; \ + bswapq RAB0; \ + rorq $32, RAB0; \ + movq 4*2(RIO), RCD0; \ + bswapq RCD0; \ + rolq $32, RCD0; \ + xorq key_table(CTX), RAB0; \ + /* block 1 */ \ + movq 8*2(RIO), RAB1; \ + bswapq RAB1; \ + rorq $32, RAB1; \ + movq 12*2(RIO), RCD1; \ + bswapq RCD1; \ + rolq $32, RCD1; \ + xorq key_table(CTX), RAB1; +/* Two-block enc_outunpack: block 0 is written to bytes 0..15 and block 1 to bytes 16..31 of (RIO), using op (mov or xor) for every write. */ +#define enc_outunpack2(op, max) \ + xorq key_table(CTX, max, 8), RCD0; \ + rolq $32, RCD0; \ + bswapq RCD0; \ + op ## q RCD0, (RIO); \ + rorq $32, RAB0; \ + bswapq RAB0; \ + op ## q RAB0, 4*2(RIO); \ + /* block 1 */ \ + xorq key_table(CTX, max, 8), RCD1; \ + rolq $32, RCD1; \ + bswapq RCD1; \ + op ## q RCD1, 8*2(RIO); \ + rorq $32, RAB1; \ + bswapq RAB1; \ + op ## q RAB1, 12*2(RIO); + +#define dec_rounds2(i) \ + roundsm2(RAB, i + 7, RCD); \ + roundsm2(RCD, i + 6, RAB); \ + roundsm2(RAB, i + 5, RCD); \ + roundsm2(RCD, i + 4, RAB); \ + roundsm2(RAB, i + 3, RCD); \ + roundsm2(RCD, i + 2, RAB); + +#define dec_fls2(i) \ + fls2(RAB, RCD, i + 1, i + 0); +/* Two-block dec_inpack: as enc_inpack2, with key_table entry number max XORed into RAB0 and RAB1. */ +#define dec_inpack2(max) \ + movq (RIO), RAB0; \ + bswapq RAB0; \ + rorq $32, RAB0; \ + movq 4*2(RIO), RCD0; \ + bswapq RCD0; \ + rolq $32, RCD0; \ + xorq key_table(CTX, max, 8), RAB0; \ + /* block 1 */ \ + movq 8*2(RIO), RAB1; \ + bswapq RAB1; \ + rorq $32, RAB1; \ + movq 12*2(RIO), RCD1; \ + bswapq RCD1; \ + rolq $32, RCD1; \ + xorq key_table(CTX, max, 8), RAB1; +/* Two-block dec_outunpack: key_table entry 0 and plain stores for both blocks. */ +#define dec_outunpack2() \ + xorq key_table(CTX), RCD0; \ + rolq $32, RCD0; \ + bswapq RCD0; \ + movq RCD0, (RIO); \ + rorq $32, RAB0; \ + bswapq RAB0; \ + movq RAB0, 4*2(RIO); \ + /* block 1 */ \ + xorq key_table(CTX), RCD1; \ + rolq $32, RCD1; \ + bswapq RCD1; \ + movq RCD1, 8*2(RIO); \ + rorq $32, RAB1; \ + bswapq RAB1; \ + movq RAB1, 12*2(RIO); + +.global 
__camellia_enc_blk_2way; +.type __camellia_enc_blk_2way,@function; + +__camellia_enc_blk_2way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool xor + */ + pushq %rbx; + /* %rbx (RAB1) was pushed above and %rbp (RT1) is parked in RRBP; both are restored on each exit path. src and dst cover two blocks, 32 bytes each. */ + movq %rbp, RRBP; + movq %rcx, RXOR; + movq %rsi, RDST; + movq %rdx, RIO; + + enc_inpack2(); + + enc_rounds2(0); + enc_fls2(8); + enc_rounds2(8); + enc_fls2(16); + enc_rounds2(16); + movl $24, RT2d; /* max */ + /* same key_length decision as in __camellia_enc_blk, except that max lives in RT2 here; it is again set only after a round group because the rounds clobber RT2 */ + cmpb $16, key_length(CTX); + je __enc2_done; + + enc_fls2(24); + enc_rounds2(24); + movl $32, RT2d; /* max */ + /* test the xor flag (written without a size suffix here, testb in the 1-way function) and point RIO back at dst */ +__enc2_done: + test RXORbl, RXORbl; + movq RDST, RIO; + jnz __enc2_xor; + + enc_outunpack2(mov, RT2); + + movq RRBP, %rbp; + popq %rbx; + ret; + +__enc2_xor: + enc_outunpack2(xor, RT2); + + movq RRBP, %rbp; + popq %rbx; + ret; +/* camellia_dec_blk_2way: decrypt the two 16-byte blocks at src and store them to dst. */ +.global camellia_dec_blk_2way; +.type camellia_dec_blk_2way,@function; + +camellia_dec_blk_2way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + cmpl $16, key_length(CTX); + movl $32, RT2d; + movl $24, RXORd; + cmovel RXORd, RT2d; /* max */ + /* RXOR is free again once the cmov has consumed it, so it keeps %rbx (RAB1) for the duration of the call instead of a stack slot */ + movq %rbx, RXOR; + movq %rbp, RRBP; + movq %rsi, RDST; + movq %rdx, RIO; + + dec_inpack2(RT2); + /* max == 24: skip the round group for entries 24 and up */ + cmpb $24, RT2bl; + je __dec2_rounds16; + + dec_rounds2(24); + dec_fls2(24); + +__dec2_rounds16: + dec_rounds2(16); + dec_fls2(16); + dec_rounds2(8); + dec_fls2(8); + dec_rounds2(0); + + movq RDST, RIO; + + dec_outunpack2(); + + movq RRBP, %rbp; + movq RXOR, %rbx; + ret; diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c new file mode 100644 index 000000000000..1ca36a93fd2f --- /dev/null +++ b/arch/x86/crypto/camellia_glue.c @@ -0,0 +1,1952 @@ +/* + * Glue Code for assembler optimized version of Camellia + * + * Copyright (c) 2012 Jussi Kivilinna + * + * Camellia parts based on code by: + * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 
2007 - Joy Latten + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ +/* NOTE(review): the header names after each #include below are missing in this copy of the patch (most likely dropped together with their angle brackets during extraction); take them from the upstream commit before applying. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAMELLIA_MIN_KEY_SIZE 16 +#define CAMELLIA_MAX_KEY_SIZE 32 +#define CAMELLIA_BLOCK_SIZE 16 +#define CAMELLIA_TABLE_BYTE_LEN 272 +/* Cipher context. camellia-x86_64-asm_64.S hard-codes this layout (key_table at byte offset 0, key_length at byte offset CAMELLIA_TABLE_BYTE_LEN), so member order and sizes must stay as they are. key_table has 272 / 8 = 34 entries of 64 bits; the assembler compares key_length against 16 to decide how many round groups to run. */ +struct camellia_ctx { + u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; + u32 key_length; +}; +/* Entry points implemented in camellia-x86_64-asm_64.S. For the encrypt routines, xor == false stores the encrypted data to dst and xor == true XORs it into dst. */ +/* regular block cipher functions */ +asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +/* The 2-way routines read two 16-byte blocks (32 bytes) from src and write 32 bytes to dst. */ +/* 2-way parallel cipher functions */ +asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +/* Thin wrappers that fix the xor argument of the assembler encrypt routines: the plain names pass false, the _xor names pass true. */ +static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, true); +} + +static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, false); 
+} + +static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, true); +} +/* Single-block encrypt and decrypt on a crypto_tfm; the camellia_ctx is obtained with crypto_tfm_ctx(tfm). */ +static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + camellia_enc_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + camellia_dec_blk(crypto_tfm_ctx(tfm), dst, src); +} +/* Lookup tables read by the assembler code, which declares them with .extern; that is why they are not static. Judging by the entries, a nonzero digit in a table name marks a byte position that carries the looked-up byte and a 0 marks a byte that is always zero. */ +/* camellia sboxes */ +const u64 camellia_sp10011110[256] = { + 0x7000007070707000, 0x8200008282828200, 0x2c00002c2c2c2c00, + 0xec0000ecececec00, 0xb30000b3b3b3b300, 0x2700002727272700, + 0xc00000c0c0c0c000, 0xe50000e5e5e5e500, 0xe40000e4e4e4e400, + 0x8500008585858500, 0x5700005757575700, 0x3500003535353500, + 0xea0000eaeaeaea00, 0x0c00000c0c0c0c00, 0xae0000aeaeaeae00, + 0x4100004141414100, 0x2300002323232300, 0xef0000efefefef00, + 0x6b00006b6b6b6b00, 0x9300009393939300, 0x4500004545454500, + 0x1900001919191900, 0xa50000a5a5a5a500, 0x2100002121212100, + 0xed0000edededed00, 0x0e00000e0e0e0e00, 0x4f00004f4f4f4f00, + 0x4e00004e4e4e4e00, 0x1d00001d1d1d1d00, 0x6500006565656500, + 0x9200009292929200, 0xbd0000bdbdbdbd00, 0x8600008686868600, + 0xb80000b8b8b8b800, 0xaf0000afafafaf00, 0x8f00008f8f8f8f00, + 0x7c00007c7c7c7c00, 0xeb0000ebebebeb00, 0x1f00001f1f1f1f00, + 0xce0000cececece00, 0x3e00003e3e3e3e00, 0x3000003030303000, + 0xdc0000dcdcdcdc00, 0x5f00005f5f5f5f00, 0x5e00005e5e5e5e00, + 0xc50000c5c5c5c500, 0x0b00000b0b0b0b00, 0x1a00001a1a1a1a00, + 0xa60000a6a6a6a600, 0xe10000e1e1e1e100, 0x3900003939393900, + 0xca0000cacacaca00, 0xd50000d5d5d5d500, 0x4700004747474700, + 0x5d00005d5d5d5d00, 0x3d00003d3d3d3d00, 0xd90000d9d9d9d900, + 0x0100000101010100, 0x5a00005a5a5a5a00, 0xd60000d6d6d6d600, + 0x5100005151515100, 0x5600005656565600, 0x6c00006c6c6c6c00, + 0x4d00004d4d4d4d00, 0x8b00008b8b8b8b00, 0x0d00000d0d0d0d00, + 0x9a00009a9a9a9a00, 0x6600006666666600, 0xfb0000fbfbfbfb00, + 0xcc0000cccccccc00, 0xb00000b0b0b0b000, 0x2d00002d2d2d2d00, + 
0x7400007474747400, 0x1200001212121200, 0x2b00002b2b2b2b00, + 0x2000002020202000, 0xf00000f0f0f0f000, 0xb10000b1b1b1b100, + 0x8400008484848400, 0x9900009999999900, 0xdf0000dfdfdfdf00, + 0x4c00004c4c4c4c00, 0xcb0000cbcbcbcb00, 0xc20000c2c2c2c200, + 0x3400003434343400, 0x7e00007e7e7e7e00, 0x7600007676767600, + 0x0500000505050500, 0x6d00006d6d6d6d00, 0xb70000b7b7b7b700, + 0xa90000a9a9a9a900, 0x3100003131313100, 0xd10000d1d1d1d100, + 0x1700001717171700, 0x0400000404040400, 0xd70000d7d7d7d700, + 0x1400001414141400, 0x5800005858585800, 0x3a00003a3a3a3a00, + 0x6100006161616100, 0xde0000dededede00, 0x1b00001b1b1b1b00, + 0x1100001111111100, 0x1c00001c1c1c1c00, 0x3200003232323200, + 0x0f00000f0f0f0f00, 0x9c00009c9c9c9c00, 0x1600001616161600, + 0x5300005353535300, 0x1800001818181800, 0xf20000f2f2f2f200, + 0x2200002222222200, 0xfe0000fefefefe00, 0x4400004444444400, + 0xcf0000cfcfcfcf00, 0xb20000b2b2b2b200, 0xc30000c3c3c3c300, + 0xb50000b5b5b5b500, 0x7a00007a7a7a7a00, 0x9100009191919100, + 0x2400002424242400, 0x0800000808080800, 0xe80000e8e8e8e800, + 0xa80000a8a8a8a800, 0x6000006060606000, 0xfc0000fcfcfcfc00, + 0x6900006969696900, 0x5000005050505000, 0xaa0000aaaaaaaa00, + 0xd00000d0d0d0d000, 0xa00000a0a0a0a000, 0x7d00007d7d7d7d00, + 0xa10000a1a1a1a100, 0x8900008989898900, 0x6200006262626200, + 0x9700009797979700, 0x5400005454545400, 0x5b00005b5b5b5b00, + 0x1e00001e1e1e1e00, 0x9500009595959500, 0xe00000e0e0e0e000, + 0xff0000ffffffff00, 0x6400006464646400, 0xd20000d2d2d2d200, + 0x1000001010101000, 0xc40000c4c4c4c400, 0x0000000000000000, + 0x4800004848484800, 0xa30000a3a3a3a300, 0xf70000f7f7f7f700, + 0x7500007575757500, 0xdb0000dbdbdbdb00, 0x8a00008a8a8a8a00, + 0x0300000303030300, 0xe60000e6e6e6e600, 0xda0000dadadada00, + 0x0900000909090900, 0x3f00003f3f3f3f00, 0xdd0000dddddddd00, + 0x9400009494949400, 0x8700008787878700, 0x5c00005c5c5c5c00, + 0x8300008383838300, 0x0200000202020200, 0xcd0000cdcdcdcd00, + 0x4a00004a4a4a4a00, 0x9000009090909000, 0x3300003333333300, + 
0x7300007373737300, 0x6700006767676700, 0xf60000f6f6f6f600, + 0xf30000f3f3f3f300, 0x9d00009d9d9d9d00, 0x7f00007f7f7f7f00, + 0xbf0000bfbfbfbf00, 0xe20000e2e2e2e200, 0x5200005252525200, + 0x9b00009b9b9b9b00, 0xd80000d8d8d8d800, 0x2600002626262600, + 0xc80000c8c8c8c800, 0x3700003737373700, 0xc60000c6c6c6c600, + 0x3b00003b3b3b3b00, 0x8100008181818100, 0x9600009696969600, + 0x6f00006f6f6f6f00, 0x4b00004b4b4b4b00, 0x1300001313131300, + 0xbe0000bebebebe00, 0x6300006363636300, 0x2e00002e2e2e2e00, + 0xe90000e9e9e9e900, 0x7900007979797900, 0xa70000a7a7a7a700, + 0x8c00008c8c8c8c00, 0x9f00009f9f9f9f00, 0x6e00006e6e6e6e00, + 0xbc0000bcbcbcbc00, 0x8e00008e8e8e8e00, 0x2900002929292900, + 0xf50000f5f5f5f500, 0xf90000f9f9f9f900, 0xb60000b6b6b6b600, + 0x2f00002f2f2f2f00, 0xfd0000fdfdfdfd00, 0xb40000b4b4b4b400, + 0x5900005959595900, 0x7800007878787800, 0x9800009898989800, + 0x0600000606060600, 0x6a00006a6a6a6a00, 0xe70000e7e7e7e700, + 0x4600004646464600, 0x7100007171717100, 0xba0000babababa00, + 0xd40000d4d4d4d400, 0x2500002525252500, 0xab0000abababab00, + 0x4200004242424200, 0x8800008888888800, 0xa20000a2a2a2a200, + 0x8d00008d8d8d8d00, 0xfa0000fafafafa00, 0x7200007272727200, + 0x0700000707070700, 0xb90000b9b9b9b900, 0x5500005555555500, + 0xf80000f8f8f8f800, 0xee0000eeeeeeee00, 0xac0000acacacac00, + 0x0a00000a0a0a0a00, 0x3600003636363600, 0x4900004949494900, + 0x2a00002a2a2a2a00, 0x6800006868686800, 0x3c00003c3c3c3c00, + 0x3800003838383800, 0xf10000f1f1f1f100, 0xa40000a4a4a4a400, + 0x4000004040404000, 0x2800002828282800, 0xd30000d3d3d3d300, + 0x7b00007b7b7b7b00, 0xbb0000bbbbbbbb00, 0xc90000c9c9c9c900, + 0x4300004343434300, 0xc10000c1c1c1c100, 0x1500001515151500, + 0xe30000e3e3e3e300, 0xad0000adadadad00, 0xf40000f4f4f4f400, + 0x7700007777777700, 0xc70000c7c7c7c700, 0x8000008080808000, + 0x9e00009e9e9e9e00, +}; + +const u64 camellia_sp22000222[256] = { + 0xe0e0000000e0e0e0, 0x0505000000050505, 0x5858000000585858, + 0xd9d9000000d9d9d9, 0x6767000000676767, 0x4e4e0000004e4e4e, + 
0x8181000000818181, 0xcbcb000000cbcbcb, 0xc9c9000000c9c9c9, + 0x0b0b0000000b0b0b, 0xaeae000000aeaeae, 0x6a6a0000006a6a6a, + 0xd5d5000000d5d5d5, 0x1818000000181818, 0x5d5d0000005d5d5d, + 0x8282000000828282, 0x4646000000464646, 0xdfdf000000dfdfdf, + 0xd6d6000000d6d6d6, 0x2727000000272727, 0x8a8a0000008a8a8a, + 0x3232000000323232, 0x4b4b0000004b4b4b, 0x4242000000424242, + 0xdbdb000000dbdbdb, 0x1c1c0000001c1c1c, 0x9e9e0000009e9e9e, + 0x9c9c0000009c9c9c, 0x3a3a0000003a3a3a, 0xcaca000000cacaca, + 0x2525000000252525, 0x7b7b0000007b7b7b, 0x0d0d0000000d0d0d, + 0x7171000000717171, 0x5f5f0000005f5f5f, 0x1f1f0000001f1f1f, + 0xf8f8000000f8f8f8, 0xd7d7000000d7d7d7, 0x3e3e0000003e3e3e, + 0x9d9d0000009d9d9d, 0x7c7c0000007c7c7c, 0x6060000000606060, + 0xb9b9000000b9b9b9, 0xbebe000000bebebe, 0xbcbc000000bcbcbc, + 0x8b8b0000008b8b8b, 0x1616000000161616, 0x3434000000343434, + 0x4d4d0000004d4d4d, 0xc3c3000000c3c3c3, 0x7272000000727272, + 0x9595000000959595, 0xabab000000ababab, 0x8e8e0000008e8e8e, + 0xbaba000000bababa, 0x7a7a0000007a7a7a, 0xb3b3000000b3b3b3, + 0x0202000000020202, 0xb4b4000000b4b4b4, 0xadad000000adadad, + 0xa2a2000000a2a2a2, 0xacac000000acacac, 0xd8d8000000d8d8d8, + 0x9a9a0000009a9a9a, 0x1717000000171717, 0x1a1a0000001a1a1a, + 0x3535000000353535, 0xcccc000000cccccc, 0xf7f7000000f7f7f7, + 0x9999000000999999, 0x6161000000616161, 0x5a5a0000005a5a5a, + 0xe8e8000000e8e8e8, 0x2424000000242424, 0x5656000000565656, + 0x4040000000404040, 0xe1e1000000e1e1e1, 0x6363000000636363, + 0x0909000000090909, 0x3333000000333333, 0xbfbf000000bfbfbf, + 0x9898000000989898, 0x9797000000979797, 0x8585000000858585, + 0x6868000000686868, 0xfcfc000000fcfcfc, 0xecec000000ececec, + 0x0a0a0000000a0a0a, 0xdada000000dadada, 0x6f6f0000006f6f6f, + 0x5353000000535353, 0x6262000000626262, 0xa3a3000000a3a3a3, + 0x2e2e0000002e2e2e, 0x0808000000080808, 0xafaf000000afafaf, + 0x2828000000282828, 0xb0b0000000b0b0b0, 0x7474000000747474, + 0xc2c2000000c2c2c2, 0xbdbd000000bdbdbd, 0x3636000000363636, + 
0x2222000000222222, 0x3838000000383838, 0x6464000000646464, + 0x1e1e0000001e1e1e, 0x3939000000393939, 0x2c2c0000002c2c2c, + 0xa6a6000000a6a6a6, 0x3030000000303030, 0xe5e5000000e5e5e5, + 0x4444000000444444, 0xfdfd000000fdfdfd, 0x8888000000888888, + 0x9f9f0000009f9f9f, 0x6565000000656565, 0x8787000000878787, + 0x6b6b0000006b6b6b, 0xf4f4000000f4f4f4, 0x2323000000232323, + 0x4848000000484848, 0x1010000000101010, 0xd1d1000000d1d1d1, + 0x5151000000515151, 0xc0c0000000c0c0c0, 0xf9f9000000f9f9f9, + 0xd2d2000000d2d2d2, 0xa0a0000000a0a0a0, 0x5555000000555555, + 0xa1a1000000a1a1a1, 0x4141000000414141, 0xfafa000000fafafa, + 0x4343000000434343, 0x1313000000131313, 0xc4c4000000c4c4c4, + 0x2f2f0000002f2f2f, 0xa8a8000000a8a8a8, 0xb6b6000000b6b6b6, + 0x3c3c0000003c3c3c, 0x2b2b0000002b2b2b, 0xc1c1000000c1c1c1, + 0xffff000000ffffff, 0xc8c8000000c8c8c8, 0xa5a5000000a5a5a5, + 0x2020000000202020, 0x8989000000898989, 0x0000000000000000, + 0x9090000000909090, 0x4747000000474747, 0xefef000000efefef, + 0xeaea000000eaeaea, 0xb7b7000000b7b7b7, 0x1515000000151515, + 0x0606000000060606, 0xcdcd000000cdcdcd, 0xb5b5000000b5b5b5, + 0x1212000000121212, 0x7e7e0000007e7e7e, 0xbbbb000000bbbbbb, + 0x2929000000292929, 0x0f0f0000000f0f0f, 0xb8b8000000b8b8b8, + 0x0707000000070707, 0x0404000000040404, 0x9b9b0000009b9b9b, + 0x9494000000949494, 0x2121000000212121, 0x6666000000666666, + 0xe6e6000000e6e6e6, 0xcece000000cecece, 0xeded000000ededed, + 0xe7e7000000e7e7e7, 0x3b3b0000003b3b3b, 0xfefe000000fefefe, + 0x7f7f0000007f7f7f, 0xc5c5000000c5c5c5, 0xa4a4000000a4a4a4, + 0x3737000000373737, 0xb1b1000000b1b1b1, 0x4c4c0000004c4c4c, + 0x9191000000919191, 0x6e6e0000006e6e6e, 0x8d8d0000008d8d8d, + 0x7676000000767676, 0x0303000000030303, 0x2d2d0000002d2d2d, + 0xdede000000dedede, 0x9696000000969696, 0x2626000000262626, + 0x7d7d0000007d7d7d, 0xc6c6000000c6c6c6, 0x5c5c0000005c5c5c, + 0xd3d3000000d3d3d3, 0xf2f2000000f2f2f2, 0x4f4f0000004f4f4f, + 0x1919000000191919, 0x3f3f0000003f3f3f, 0xdcdc000000dcdcdc, + 
0x7979000000797979, 0x1d1d0000001d1d1d, 0x5252000000525252, + 0xebeb000000ebebeb, 0xf3f3000000f3f3f3, 0x6d6d0000006d6d6d, + 0x5e5e0000005e5e5e, 0xfbfb000000fbfbfb, 0x6969000000696969, + 0xb2b2000000b2b2b2, 0xf0f0000000f0f0f0, 0x3131000000313131, + 0x0c0c0000000c0c0c, 0xd4d4000000d4d4d4, 0xcfcf000000cfcfcf, + 0x8c8c0000008c8c8c, 0xe2e2000000e2e2e2, 0x7575000000757575, + 0xa9a9000000a9a9a9, 0x4a4a0000004a4a4a, 0x5757000000575757, + 0x8484000000848484, 0x1111000000111111, 0x4545000000454545, + 0x1b1b0000001b1b1b, 0xf5f5000000f5f5f5, 0xe4e4000000e4e4e4, + 0x0e0e0000000e0e0e, 0x7373000000737373, 0xaaaa000000aaaaaa, + 0xf1f1000000f1f1f1, 0xdddd000000dddddd, 0x5959000000595959, + 0x1414000000141414, 0x6c6c0000006c6c6c, 0x9292000000929292, + 0x5454000000545454, 0xd0d0000000d0d0d0, 0x7878000000787878, + 0x7070000000707070, 0xe3e3000000e3e3e3, 0x4949000000494949, + 0x8080000000808080, 0x5050000000505050, 0xa7a7000000a7a7a7, + 0xf6f6000000f6f6f6, 0x7777000000777777, 0x9393000000939393, + 0x8686000000868686, 0x8383000000838383, 0x2a2a0000002a2a2a, + 0xc7c7000000c7c7c7, 0x5b5b0000005b5b5b, 0xe9e9000000e9e9e9, + 0xeeee000000eeeeee, 0x8f8f0000008f8f8f, 0x0101000000010101, + 0x3d3d0000003d3d3d, +}; + +const u64 camellia_sp03303033[256] = { + 0x0038380038003838, 0x0041410041004141, 0x0016160016001616, + 0x0076760076007676, 0x00d9d900d900d9d9, 0x0093930093009393, + 0x0060600060006060, 0x00f2f200f200f2f2, 0x0072720072007272, + 0x00c2c200c200c2c2, 0x00abab00ab00abab, 0x009a9a009a009a9a, + 0x0075750075007575, 0x0006060006000606, 0x0057570057005757, + 0x00a0a000a000a0a0, 0x0091910091009191, 0x00f7f700f700f7f7, + 0x00b5b500b500b5b5, 0x00c9c900c900c9c9, 0x00a2a200a200a2a2, + 0x008c8c008c008c8c, 0x00d2d200d200d2d2, 0x0090900090009090, + 0x00f6f600f600f6f6, 0x0007070007000707, 0x00a7a700a700a7a7, + 0x0027270027002727, 0x008e8e008e008e8e, 0x00b2b200b200b2b2, + 0x0049490049004949, 0x00dede00de00dede, 0x0043430043004343, + 0x005c5c005c005c5c, 0x00d7d700d700d7d7, 0x00c7c700c700c7c7, + 
0x003e3e003e003e3e, 0x00f5f500f500f5f5, 0x008f8f008f008f8f, + 0x0067670067006767, 0x001f1f001f001f1f, 0x0018180018001818, + 0x006e6e006e006e6e, 0x00afaf00af00afaf, 0x002f2f002f002f2f, + 0x00e2e200e200e2e2, 0x0085850085008585, 0x000d0d000d000d0d, + 0x0053530053005353, 0x00f0f000f000f0f0, 0x009c9c009c009c9c, + 0x0065650065006565, 0x00eaea00ea00eaea, 0x00a3a300a300a3a3, + 0x00aeae00ae00aeae, 0x009e9e009e009e9e, 0x00ecec00ec00ecec, + 0x0080800080008080, 0x002d2d002d002d2d, 0x006b6b006b006b6b, + 0x00a8a800a800a8a8, 0x002b2b002b002b2b, 0x0036360036003636, + 0x00a6a600a600a6a6, 0x00c5c500c500c5c5, 0x0086860086008686, + 0x004d4d004d004d4d, 0x0033330033003333, 0x00fdfd00fd00fdfd, + 0x0066660066006666, 0x0058580058005858, 0x0096960096009696, + 0x003a3a003a003a3a, 0x0009090009000909, 0x0095950095009595, + 0x0010100010001010, 0x0078780078007878, 0x00d8d800d800d8d8, + 0x0042420042004242, 0x00cccc00cc00cccc, 0x00efef00ef00efef, + 0x0026260026002626, 0x00e5e500e500e5e5, 0x0061610061006161, + 0x001a1a001a001a1a, 0x003f3f003f003f3f, 0x003b3b003b003b3b, + 0x0082820082008282, 0x00b6b600b600b6b6, 0x00dbdb00db00dbdb, + 0x00d4d400d400d4d4, 0x0098980098009898, 0x00e8e800e800e8e8, + 0x008b8b008b008b8b, 0x0002020002000202, 0x00ebeb00eb00ebeb, + 0x000a0a000a000a0a, 0x002c2c002c002c2c, 0x001d1d001d001d1d, + 0x00b0b000b000b0b0, 0x006f6f006f006f6f, 0x008d8d008d008d8d, + 0x0088880088008888, 0x000e0e000e000e0e, 0x0019190019001919, + 0x0087870087008787, 0x004e4e004e004e4e, 0x000b0b000b000b0b, + 0x00a9a900a900a9a9, 0x000c0c000c000c0c, 0x0079790079007979, + 0x0011110011001111, 0x007f7f007f007f7f, 0x0022220022002222, + 0x00e7e700e700e7e7, 0x0059590059005959, 0x00e1e100e100e1e1, + 0x00dada00da00dada, 0x003d3d003d003d3d, 0x00c8c800c800c8c8, + 0x0012120012001212, 0x0004040004000404, 0x0074740074007474, + 0x0054540054005454, 0x0030300030003030, 0x007e7e007e007e7e, + 0x00b4b400b400b4b4, 0x0028280028002828, 0x0055550055005555, + 0x0068680068006868, 0x0050500050005050, 0x00bebe00be00bebe, + 
0x00d0d000d000d0d0, 0x00c4c400c400c4c4, 0x0031310031003131, + 0x00cbcb00cb00cbcb, 0x002a2a002a002a2a, 0x00adad00ad00adad, + 0x000f0f000f000f0f, 0x00caca00ca00caca, 0x0070700070007070, + 0x00ffff00ff00ffff, 0x0032320032003232, 0x0069690069006969, + 0x0008080008000808, 0x0062620062006262, 0x0000000000000000, + 0x0024240024002424, 0x00d1d100d100d1d1, 0x00fbfb00fb00fbfb, + 0x00baba00ba00baba, 0x00eded00ed00eded, 0x0045450045004545, + 0x0081810081008181, 0x0073730073007373, 0x006d6d006d006d6d, + 0x0084840084008484, 0x009f9f009f009f9f, 0x00eeee00ee00eeee, + 0x004a4a004a004a4a, 0x00c3c300c300c3c3, 0x002e2e002e002e2e, + 0x00c1c100c100c1c1, 0x0001010001000101, 0x00e6e600e600e6e6, + 0x0025250025002525, 0x0048480048004848, 0x0099990099009999, + 0x00b9b900b900b9b9, 0x00b3b300b300b3b3, 0x007b7b007b007b7b, + 0x00f9f900f900f9f9, 0x00cece00ce00cece, 0x00bfbf00bf00bfbf, + 0x00dfdf00df00dfdf, 0x0071710071007171, 0x0029290029002929, + 0x00cdcd00cd00cdcd, 0x006c6c006c006c6c, 0x0013130013001313, + 0x0064640064006464, 0x009b9b009b009b9b, 0x0063630063006363, + 0x009d9d009d009d9d, 0x00c0c000c000c0c0, 0x004b4b004b004b4b, + 0x00b7b700b700b7b7, 0x00a5a500a500a5a5, 0x0089890089008989, + 0x005f5f005f005f5f, 0x00b1b100b100b1b1, 0x0017170017001717, + 0x00f4f400f400f4f4, 0x00bcbc00bc00bcbc, 0x00d3d300d300d3d3, + 0x0046460046004646, 0x00cfcf00cf00cfcf, 0x0037370037003737, + 0x005e5e005e005e5e, 0x0047470047004747, 0x0094940094009494, + 0x00fafa00fa00fafa, 0x00fcfc00fc00fcfc, 0x005b5b005b005b5b, + 0x0097970097009797, 0x00fefe00fe00fefe, 0x005a5a005a005a5a, + 0x00acac00ac00acac, 0x003c3c003c003c3c, 0x004c4c004c004c4c, + 0x0003030003000303, 0x0035350035003535, 0x00f3f300f300f3f3, + 0x0023230023002323, 0x00b8b800b800b8b8, 0x005d5d005d005d5d, + 0x006a6a006a006a6a, 0x0092920092009292, 0x00d5d500d500d5d5, + 0x0021210021002121, 0x0044440044004444, 0x0051510051005151, + 0x00c6c600c600c6c6, 0x007d7d007d007d7d, 0x0039390039003939, + 0x0083830083008383, 0x00dcdc00dc00dcdc, 0x00aaaa00aa00aaaa, + 
0x007c7c007c007c7c, 0x0077770077007777, 0x0056560056005656, + 0x0005050005000505, 0x001b1b001b001b1b, 0x00a4a400a400a4a4, + 0x0015150015001515, 0x0034340034003434, 0x001e1e001e001e1e, + 0x001c1c001c001c1c, 0x00f8f800f800f8f8, 0x0052520052005252, + 0x0020200020002020, 0x0014140014001414, 0x00e9e900e900e9e9, + 0x00bdbd00bd00bdbd, 0x00dddd00dd00dddd, 0x00e4e400e400e4e4, + 0x00a1a100a100a1a1, 0x00e0e000e000e0e0, 0x008a8a008a008a8a, + 0x00f1f100f100f1f1, 0x00d6d600d600d6d6, 0x007a7a007a007a7a, + 0x00bbbb00bb00bbbb, 0x00e3e300e300e3e3, 0x0040400040004040, + 0x004f4f004f004f4f, +}; + +const u64 camellia_sp00444404[256] = { + 0x0000707070700070, 0x00002c2c2c2c002c, 0x0000b3b3b3b300b3, + 0x0000c0c0c0c000c0, 0x0000e4e4e4e400e4, 0x0000575757570057, + 0x0000eaeaeaea00ea, 0x0000aeaeaeae00ae, 0x0000232323230023, + 0x00006b6b6b6b006b, 0x0000454545450045, 0x0000a5a5a5a500a5, + 0x0000edededed00ed, 0x00004f4f4f4f004f, 0x00001d1d1d1d001d, + 0x0000929292920092, 0x0000868686860086, 0x0000afafafaf00af, + 0x00007c7c7c7c007c, 0x00001f1f1f1f001f, 0x00003e3e3e3e003e, + 0x0000dcdcdcdc00dc, 0x00005e5e5e5e005e, 0x00000b0b0b0b000b, + 0x0000a6a6a6a600a6, 0x0000393939390039, 0x0000d5d5d5d500d5, + 0x00005d5d5d5d005d, 0x0000d9d9d9d900d9, 0x00005a5a5a5a005a, + 0x0000515151510051, 0x00006c6c6c6c006c, 0x00008b8b8b8b008b, + 0x00009a9a9a9a009a, 0x0000fbfbfbfb00fb, 0x0000b0b0b0b000b0, + 0x0000747474740074, 0x00002b2b2b2b002b, 0x0000f0f0f0f000f0, + 0x0000848484840084, 0x0000dfdfdfdf00df, 0x0000cbcbcbcb00cb, + 0x0000343434340034, 0x0000767676760076, 0x00006d6d6d6d006d, + 0x0000a9a9a9a900a9, 0x0000d1d1d1d100d1, 0x0000040404040004, + 0x0000141414140014, 0x00003a3a3a3a003a, 0x0000dededede00de, + 0x0000111111110011, 0x0000323232320032, 0x00009c9c9c9c009c, + 0x0000535353530053, 0x0000f2f2f2f200f2, 0x0000fefefefe00fe, + 0x0000cfcfcfcf00cf, 0x0000c3c3c3c300c3, 0x00007a7a7a7a007a, + 0x0000242424240024, 0x0000e8e8e8e800e8, 0x0000606060600060, + 0x0000696969690069, 0x0000aaaaaaaa00aa, 0x0000a0a0a0a000a0, + 
0x0000a1a1a1a100a1, 0x0000626262620062, 0x0000545454540054, + 0x00001e1e1e1e001e, 0x0000e0e0e0e000e0, 0x0000646464640064, + 0x0000101010100010, 0x0000000000000000, 0x0000a3a3a3a300a3, + 0x0000757575750075, 0x00008a8a8a8a008a, 0x0000e6e6e6e600e6, + 0x0000090909090009, 0x0000dddddddd00dd, 0x0000878787870087, + 0x0000838383830083, 0x0000cdcdcdcd00cd, 0x0000909090900090, + 0x0000737373730073, 0x0000f6f6f6f600f6, 0x00009d9d9d9d009d, + 0x0000bfbfbfbf00bf, 0x0000525252520052, 0x0000d8d8d8d800d8, + 0x0000c8c8c8c800c8, 0x0000c6c6c6c600c6, 0x0000818181810081, + 0x00006f6f6f6f006f, 0x0000131313130013, 0x0000636363630063, + 0x0000e9e9e9e900e9, 0x0000a7a7a7a700a7, 0x00009f9f9f9f009f, + 0x0000bcbcbcbc00bc, 0x0000292929290029, 0x0000f9f9f9f900f9, + 0x00002f2f2f2f002f, 0x0000b4b4b4b400b4, 0x0000787878780078, + 0x0000060606060006, 0x0000e7e7e7e700e7, 0x0000717171710071, + 0x0000d4d4d4d400d4, 0x0000abababab00ab, 0x0000888888880088, + 0x00008d8d8d8d008d, 0x0000727272720072, 0x0000b9b9b9b900b9, + 0x0000f8f8f8f800f8, 0x0000acacacac00ac, 0x0000363636360036, + 0x00002a2a2a2a002a, 0x00003c3c3c3c003c, 0x0000f1f1f1f100f1, + 0x0000404040400040, 0x0000d3d3d3d300d3, 0x0000bbbbbbbb00bb, + 0x0000434343430043, 0x0000151515150015, 0x0000adadadad00ad, + 0x0000777777770077, 0x0000808080800080, 0x0000828282820082, + 0x0000ecececec00ec, 0x0000272727270027, 0x0000e5e5e5e500e5, + 0x0000858585850085, 0x0000353535350035, 0x00000c0c0c0c000c, + 0x0000414141410041, 0x0000efefefef00ef, 0x0000939393930093, + 0x0000191919190019, 0x0000212121210021, 0x00000e0e0e0e000e, + 0x00004e4e4e4e004e, 0x0000656565650065, 0x0000bdbdbdbd00bd, + 0x0000b8b8b8b800b8, 0x00008f8f8f8f008f, 0x0000ebebebeb00eb, + 0x0000cececece00ce, 0x0000303030300030, 0x00005f5f5f5f005f, + 0x0000c5c5c5c500c5, 0x00001a1a1a1a001a, 0x0000e1e1e1e100e1, + 0x0000cacacaca00ca, 0x0000474747470047, 0x00003d3d3d3d003d, + 0x0000010101010001, 0x0000d6d6d6d600d6, 0x0000565656560056, + 0x00004d4d4d4d004d, 0x00000d0d0d0d000d, 0x0000666666660066, + 
0x0000cccccccc00cc, 0x00002d2d2d2d002d, 0x0000121212120012, + 0x0000202020200020, 0x0000b1b1b1b100b1, 0x0000999999990099, + 0x00004c4c4c4c004c, 0x0000c2c2c2c200c2, 0x00007e7e7e7e007e, + 0x0000050505050005, 0x0000b7b7b7b700b7, 0x0000313131310031, + 0x0000171717170017, 0x0000d7d7d7d700d7, 0x0000585858580058, + 0x0000616161610061, 0x00001b1b1b1b001b, 0x00001c1c1c1c001c, + 0x00000f0f0f0f000f, 0x0000161616160016, 0x0000181818180018, + 0x0000222222220022, 0x0000444444440044, 0x0000b2b2b2b200b2, + 0x0000b5b5b5b500b5, 0x0000919191910091, 0x0000080808080008, + 0x0000a8a8a8a800a8, 0x0000fcfcfcfc00fc, 0x0000505050500050, + 0x0000d0d0d0d000d0, 0x00007d7d7d7d007d, 0x0000898989890089, + 0x0000979797970097, 0x00005b5b5b5b005b, 0x0000959595950095, + 0x0000ffffffff00ff, 0x0000d2d2d2d200d2, 0x0000c4c4c4c400c4, + 0x0000484848480048, 0x0000f7f7f7f700f7, 0x0000dbdbdbdb00db, + 0x0000030303030003, 0x0000dadadada00da, 0x00003f3f3f3f003f, + 0x0000949494940094, 0x00005c5c5c5c005c, 0x0000020202020002, + 0x00004a4a4a4a004a, 0x0000333333330033, 0x0000676767670067, + 0x0000f3f3f3f300f3, 0x00007f7f7f7f007f, 0x0000e2e2e2e200e2, + 0x00009b9b9b9b009b, 0x0000262626260026, 0x0000373737370037, + 0x00003b3b3b3b003b, 0x0000969696960096, 0x00004b4b4b4b004b, + 0x0000bebebebe00be, 0x00002e2e2e2e002e, 0x0000797979790079, + 0x00008c8c8c8c008c, 0x00006e6e6e6e006e, 0x00008e8e8e8e008e, + 0x0000f5f5f5f500f5, 0x0000b6b6b6b600b6, 0x0000fdfdfdfd00fd, + 0x0000595959590059, 0x0000989898980098, 0x00006a6a6a6a006a, + 0x0000464646460046, 0x0000babababa00ba, 0x0000252525250025, + 0x0000424242420042, 0x0000a2a2a2a200a2, 0x0000fafafafa00fa, + 0x0000070707070007, 0x0000555555550055, 0x0000eeeeeeee00ee, + 0x00000a0a0a0a000a, 0x0000494949490049, 0x0000686868680068, + 0x0000383838380038, 0x0000a4a4a4a400a4, 0x0000282828280028, + 0x00007b7b7b7b007b, 0x0000c9c9c9c900c9, 0x0000c1c1c1c100c1, + 0x0000e3e3e3e300e3, 0x0000f4f4f4f400f4, 0x0000c7c7c7c700c7, + 0x00009e9e9e9e009e, +}; + +const u64 camellia_sp02220222[256] = { + 
0x00e0e0e000e0e0e0, 0x0005050500050505, 0x0058585800585858, + 0x00d9d9d900d9d9d9, 0x0067676700676767, 0x004e4e4e004e4e4e, + 0x0081818100818181, 0x00cbcbcb00cbcbcb, 0x00c9c9c900c9c9c9, + 0x000b0b0b000b0b0b, 0x00aeaeae00aeaeae, 0x006a6a6a006a6a6a, + 0x00d5d5d500d5d5d5, 0x0018181800181818, 0x005d5d5d005d5d5d, + 0x0082828200828282, 0x0046464600464646, 0x00dfdfdf00dfdfdf, + 0x00d6d6d600d6d6d6, 0x0027272700272727, 0x008a8a8a008a8a8a, + 0x0032323200323232, 0x004b4b4b004b4b4b, 0x0042424200424242, + 0x00dbdbdb00dbdbdb, 0x001c1c1c001c1c1c, 0x009e9e9e009e9e9e, + 0x009c9c9c009c9c9c, 0x003a3a3a003a3a3a, 0x00cacaca00cacaca, + 0x0025252500252525, 0x007b7b7b007b7b7b, 0x000d0d0d000d0d0d, + 0x0071717100717171, 0x005f5f5f005f5f5f, 0x001f1f1f001f1f1f, + 0x00f8f8f800f8f8f8, 0x00d7d7d700d7d7d7, 0x003e3e3e003e3e3e, + 0x009d9d9d009d9d9d, 0x007c7c7c007c7c7c, 0x0060606000606060, + 0x00b9b9b900b9b9b9, 0x00bebebe00bebebe, 0x00bcbcbc00bcbcbc, + 0x008b8b8b008b8b8b, 0x0016161600161616, 0x0034343400343434, + 0x004d4d4d004d4d4d, 0x00c3c3c300c3c3c3, 0x0072727200727272, + 0x0095959500959595, 0x00ababab00ababab, 0x008e8e8e008e8e8e, + 0x00bababa00bababa, 0x007a7a7a007a7a7a, 0x00b3b3b300b3b3b3, + 0x0002020200020202, 0x00b4b4b400b4b4b4, 0x00adadad00adadad, + 0x00a2a2a200a2a2a2, 0x00acacac00acacac, 0x00d8d8d800d8d8d8, + 0x009a9a9a009a9a9a, 0x0017171700171717, 0x001a1a1a001a1a1a, + 0x0035353500353535, 0x00cccccc00cccccc, 0x00f7f7f700f7f7f7, + 0x0099999900999999, 0x0061616100616161, 0x005a5a5a005a5a5a, + 0x00e8e8e800e8e8e8, 0x0024242400242424, 0x0056565600565656, + 0x0040404000404040, 0x00e1e1e100e1e1e1, 0x0063636300636363, + 0x0009090900090909, 0x0033333300333333, 0x00bfbfbf00bfbfbf, + 0x0098989800989898, 0x0097979700979797, 0x0085858500858585, + 0x0068686800686868, 0x00fcfcfc00fcfcfc, 0x00ececec00ececec, + 0x000a0a0a000a0a0a, 0x00dadada00dadada, 0x006f6f6f006f6f6f, + 0x0053535300535353, 0x0062626200626262, 0x00a3a3a300a3a3a3, + 0x002e2e2e002e2e2e, 0x0008080800080808, 0x00afafaf00afafaf, + 
0x0028282800282828, 0x00b0b0b000b0b0b0, 0x0074747400747474, + 0x00c2c2c200c2c2c2, 0x00bdbdbd00bdbdbd, 0x0036363600363636, + 0x0022222200222222, 0x0038383800383838, 0x0064646400646464, + 0x001e1e1e001e1e1e, 0x0039393900393939, 0x002c2c2c002c2c2c, + 0x00a6a6a600a6a6a6, 0x0030303000303030, 0x00e5e5e500e5e5e5, + 0x0044444400444444, 0x00fdfdfd00fdfdfd, 0x0088888800888888, + 0x009f9f9f009f9f9f, 0x0065656500656565, 0x0087878700878787, + 0x006b6b6b006b6b6b, 0x00f4f4f400f4f4f4, 0x0023232300232323, + 0x0048484800484848, 0x0010101000101010, 0x00d1d1d100d1d1d1, + 0x0051515100515151, 0x00c0c0c000c0c0c0, 0x00f9f9f900f9f9f9, + 0x00d2d2d200d2d2d2, 0x00a0a0a000a0a0a0, 0x0055555500555555, + 0x00a1a1a100a1a1a1, 0x0041414100414141, 0x00fafafa00fafafa, + 0x0043434300434343, 0x0013131300131313, 0x00c4c4c400c4c4c4, + 0x002f2f2f002f2f2f, 0x00a8a8a800a8a8a8, 0x00b6b6b600b6b6b6, + 0x003c3c3c003c3c3c, 0x002b2b2b002b2b2b, 0x00c1c1c100c1c1c1, + 0x00ffffff00ffffff, 0x00c8c8c800c8c8c8, 0x00a5a5a500a5a5a5, + 0x0020202000202020, 0x0089898900898989, 0x0000000000000000, + 0x0090909000909090, 0x0047474700474747, 0x00efefef00efefef, + 0x00eaeaea00eaeaea, 0x00b7b7b700b7b7b7, 0x0015151500151515, + 0x0006060600060606, 0x00cdcdcd00cdcdcd, 0x00b5b5b500b5b5b5, + 0x0012121200121212, 0x007e7e7e007e7e7e, 0x00bbbbbb00bbbbbb, + 0x0029292900292929, 0x000f0f0f000f0f0f, 0x00b8b8b800b8b8b8, + 0x0007070700070707, 0x0004040400040404, 0x009b9b9b009b9b9b, + 0x0094949400949494, 0x0021212100212121, 0x0066666600666666, + 0x00e6e6e600e6e6e6, 0x00cecece00cecece, 0x00ededed00ededed, + 0x00e7e7e700e7e7e7, 0x003b3b3b003b3b3b, 0x00fefefe00fefefe, + 0x007f7f7f007f7f7f, 0x00c5c5c500c5c5c5, 0x00a4a4a400a4a4a4, + 0x0037373700373737, 0x00b1b1b100b1b1b1, 0x004c4c4c004c4c4c, + 0x0091919100919191, 0x006e6e6e006e6e6e, 0x008d8d8d008d8d8d, + 0x0076767600767676, 0x0003030300030303, 0x002d2d2d002d2d2d, + 0x00dedede00dedede, 0x0096969600969696, 0x0026262600262626, + 0x007d7d7d007d7d7d, 0x00c6c6c600c6c6c6, 0x005c5c5c005c5c5c, + 
0x00d3d3d300d3d3d3, 0x00f2f2f200f2f2f2, 0x004f4f4f004f4f4f, + 0x0019191900191919, 0x003f3f3f003f3f3f, 0x00dcdcdc00dcdcdc, + 0x0079797900797979, 0x001d1d1d001d1d1d, 0x0052525200525252, + 0x00ebebeb00ebebeb, 0x00f3f3f300f3f3f3, 0x006d6d6d006d6d6d, + 0x005e5e5e005e5e5e, 0x00fbfbfb00fbfbfb, 0x0069696900696969, + 0x00b2b2b200b2b2b2, 0x00f0f0f000f0f0f0, 0x0031313100313131, + 0x000c0c0c000c0c0c, 0x00d4d4d400d4d4d4, 0x00cfcfcf00cfcfcf, + 0x008c8c8c008c8c8c, 0x00e2e2e200e2e2e2, 0x0075757500757575, + 0x00a9a9a900a9a9a9, 0x004a4a4a004a4a4a, 0x0057575700575757, + 0x0084848400848484, 0x0011111100111111, 0x0045454500454545, + 0x001b1b1b001b1b1b, 0x00f5f5f500f5f5f5, 0x00e4e4e400e4e4e4, + 0x000e0e0e000e0e0e, 0x0073737300737373, 0x00aaaaaa00aaaaaa, + 0x00f1f1f100f1f1f1, 0x00dddddd00dddddd, 0x0059595900595959, + 0x0014141400141414, 0x006c6c6c006c6c6c, 0x0092929200929292, + 0x0054545400545454, 0x00d0d0d000d0d0d0, 0x0078787800787878, + 0x0070707000707070, 0x00e3e3e300e3e3e3, 0x0049494900494949, + 0x0080808000808080, 0x0050505000505050, 0x00a7a7a700a7a7a7, + 0x00f6f6f600f6f6f6, 0x0077777700777777, 0x0093939300939393, + 0x0086868600868686, 0x0083838300838383, 0x002a2a2a002a2a2a, + 0x00c7c7c700c7c7c7, 0x005b5b5b005b5b5b, 0x00e9e9e900e9e9e9, + 0x00eeeeee00eeeeee, 0x008f8f8f008f8f8f, 0x0001010100010101, + 0x003d3d3d003d3d3d, +}; + +const u64 camellia_sp30333033[256] = { + 0x3800383838003838, 0x4100414141004141, 0x1600161616001616, + 0x7600767676007676, 0xd900d9d9d900d9d9, 0x9300939393009393, + 0x6000606060006060, 0xf200f2f2f200f2f2, 0x7200727272007272, + 0xc200c2c2c200c2c2, 0xab00ababab00abab, 0x9a009a9a9a009a9a, + 0x7500757575007575, 0x0600060606000606, 0x5700575757005757, + 0xa000a0a0a000a0a0, 0x9100919191009191, 0xf700f7f7f700f7f7, + 0xb500b5b5b500b5b5, 0xc900c9c9c900c9c9, 0xa200a2a2a200a2a2, + 0x8c008c8c8c008c8c, 0xd200d2d2d200d2d2, 0x9000909090009090, + 0xf600f6f6f600f6f6, 0x0700070707000707, 0xa700a7a7a700a7a7, + 0x2700272727002727, 0x8e008e8e8e008e8e, 0xb200b2b2b200b2b2, + 
0x4900494949004949, 0xde00dedede00dede, 0x4300434343004343, + 0x5c005c5c5c005c5c, 0xd700d7d7d700d7d7, 0xc700c7c7c700c7c7, + 0x3e003e3e3e003e3e, 0xf500f5f5f500f5f5, 0x8f008f8f8f008f8f, + 0x6700676767006767, 0x1f001f1f1f001f1f, 0x1800181818001818, + 0x6e006e6e6e006e6e, 0xaf00afafaf00afaf, 0x2f002f2f2f002f2f, + 0xe200e2e2e200e2e2, 0x8500858585008585, 0x0d000d0d0d000d0d, + 0x5300535353005353, 0xf000f0f0f000f0f0, 0x9c009c9c9c009c9c, + 0x6500656565006565, 0xea00eaeaea00eaea, 0xa300a3a3a300a3a3, + 0xae00aeaeae00aeae, 0x9e009e9e9e009e9e, 0xec00ececec00ecec, + 0x8000808080008080, 0x2d002d2d2d002d2d, 0x6b006b6b6b006b6b, + 0xa800a8a8a800a8a8, 0x2b002b2b2b002b2b, 0x3600363636003636, + 0xa600a6a6a600a6a6, 0xc500c5c5c500c5c5, 0x8600868686008686, + 0x4d004d4d4d004d4d, 0x3300333333003333, 0xfd00fdfdfd00fdfd, + 0x6600666666006666, 0x5800585858005858, 0x9600969696009696, + 0x3a003a3a3a003a3a, 0x0900090909000909, 0x9500959595009595, + 0x1000101010001010, 0x7800787878007878, 0xd800d8d8d800d8d8, + 0x4200424242004242, 0xcc00cccccc00cccc, 0xef00efefef00efef, + 0x2600262626002626, 0xe500e5e5e500e5e5, 0x6100616161006161, + 0x1a001a1a1a001a1a, 0x3f003f3f3f003f3f, 0x3b003b3b3b003b3b, + 0x8200828282008282, 0xb600b6b6b600b6b6, 0xdb00dbdbdb00dbdb, + 0xd400d4d4d400d4d4, 0x9800989898009898, 0xe800e8e8e800e8e8, + 0x8b008b8b8b008b8b, 0x0200020202000202, 0xeb00ebebeb00ebeb, + 0x0a000a0a0a000a0a, 0x2c002c2c2c002c2c, 0x1d001d1d1d001d1d, + 0xb000b0b0b000b0b0, 0x6f006f6f6f006f6f, 0x8d008d8d8d008d8d, + 0x8800888888008888, 0x0e000e0e0e000e0e, 0x1900191919001919, + 0x8700878787008787, 0x4e004e4e4e004e4e, 0x0b000b0b0b000b0b, + 0xa900a9a9a900a9a9, 0x0c000c0c0c000c0c, 0x7900797979007979, + 0x1100111111001111, 0x7f007f7f7f007f7f, 0x2200222222002222, + 0xe700e7e7e700e7e7, 0x5900595959005959, 0xe100e1e1e100e1e1, + 0xda00dadada00dada, 0x3d003d3d3d003d3d, 0xc800c8c8c800c8c8, + 0x1200121212001212, 0x0400040404000404, 0x7400747474007474, + 0x5400545454005454, 0x3000303030003030, 0x7e007e7e7e007e7e, + 
0xb400b4b4b400b4b4, 0x2800282828002828, 0x5500555555005555, + 0x6800686868006868, 0x5000505050005050, 0xbe00bebebe00bebe, + 0xd000d0d0d000d0d0, 0xc400c4c4c400c4c4, 0x3100313131003131, + 0xcb00cbcbcb00cbcb, 0x2a002a2a2a002a2a, 0xad00adadad00adad, + 0x0f000f0f0f000f0f, 0xca00cacaca00caca, 0x7000707070007070, + 0xff00ffffff00ffff, 0x3200323232003232, 0x6900696969006969, + 0x0800080808000808, 0x6200626262006262, 0x0000000000000000, + 0x2400242424002424, 0xd100d1d1d100d1d1, 0xfb00fbfbfb00fbfb, + 0xba00bababa00baba, 0xed00ededed00eded, 0x4500454545004545, + 0x8100818181008181, 0x7300737373007373, 0x6d006d6d6d006d6d, + 0x8400848484008484, 0x9f009f9f9f009f9f, 0xee00eeeeee00eeee, + 0x4a004a4a4a004a4a, 0xc300c3c3c300c3c3, 0x2e002e2e2e002e2e, + 0xc100c1c1c100c1c1, 0x0100010101000101, 0xe600e6e6e600e6e6, + 0x2500252525002525, 0x4800484848004848, 0x9900999999009999, + 0xb900b9b9b900b9b9, 0xb300b3b3b300b3b3, 0x7b007b7b7b007b7b, + 0xf900f9f9f900f9f9, 0xce00cecece00cece, 0xbf00bfbfbf00bfbf, + 0xdf00dfdfdf00dfdf, 0x7100717171007171, 0x2900292929002929, + 0xcd00cdcdcd00cdcd, 0x6c006c6c6c006c6c, 0x1300131313001313, + 0x6400646464006464, 0x9b009b9b9b009b9b, 0x6300636363006363, + 0x9d009d9d9d009d9d, 0xc000c0c0c000c0c0, 0x4b004b4b4b004b4b, + 0xb700b7b7b700b7b7, 0xa500a5a5a500a5a5, 0x8900898989008989, + 0x5f005f5f5f005f5f, 0xb100b1b1b100b1b1, 0x1700171717001717, + 0xf400f4f4f400f4f4, 0xbc00bcbcbc00bcbc, 0xd300d3d3d300d3d3, + 0x4600464646004646, 0xcf00cfcfcf00cfcf, 0x3700373737003737, + 0x5e005e5e5e005e5e, 0x4700474747004747, 0x9400949494009494, + 0xfa00fafafa00fafa, 0xfc00fcfcfc00fcfc, 0x5b005b5b5b005b5b, + 0x9700979797009797, 0xfe00fefefe00fefe, 0x5a005a5a5a005a5a, + 0xac00acacac00acac, 0x3c003c3c3c003c3c, 0x4c004c4c4c004c4c, + 0x0300030303000303, 0x3500353535003535, 0xf300f3f3f300f3f3, + 0x2300232323002323, 0xb800b8b8b800b8b8, 0x5d005d5d5d005d5d, + 0x6a006a6a6a006a6a, 0x9200929292009292, 0xd500d5d5d500d5d5, + 0x2100212121002121, 0x4400444444004444, 0x5100515151005151, + 
0xc600c6c6c600c6c6, 0x7d007d7d7d007d7d, 0x3900393939003939, + 0x8300838383008383, 0xdc00dcdcdc00dcdc, 0xaa00aaaaaa00aaaa, + 0x7c007c7c7c007c7c, 0x7700777777007777, 0x5600565656005656, + 0x0500050505000505, 0x1b001b1b1b001b1b, 0xa400a4a4a400a4a4, + 0x1500151515001515, 0x3400343434003434, 0x1e001e1e1e001e1e, + 0x1c001c1c1c001c1c, 0xf800f8f8f800f8f8, 0x5200525252005252, + 0x2000202020002020, 0x1400141414001414, 0xe900e9e9e900e9e9, + 0xbd00bdbdbd00bdbd, 0xdd00dddddd00dddd, 0xe400e4e4e400e4e4, + 0xa100a1a1a100a1a1, 0xe000e0e0e000e0e0, 0x8a008a8a8a008a8a, + 0xf100f1f1f100f1f1, 0xd600d6d6d600d6d6, 0x7a007a7a7a007a7a, + 0xbb00bbbbbb00bbbb, 0xe300e3e3e300e3e3, 0x4000404040004040, + 0x4f004f4f4f004f4f, +}; + +const u64 camellia_sp44044404[256] = { + 0x7070007070700070, 0x2c2c002c2c2c002c, 0xb3b300b3b3b300b3, + 0xc0c000c0c0c000c0, 0xe4e400e4e4e400e4, 0x5757005757570057, + 0xeaea00eaeaea00ea, 0xaeae00aeaeae00ae, 0x2323002323230023, + 0x6b6b006b6b6b006b, 0x4545004545450045, 0xa5a500a5a5a500a5, + 0xeded00ededed00ed, 0x4f4f004f4f4f004f, 0x1d1d001d1d1d001d, + 0x9292009292920092, 0x8686008686860086, 0xafaf00afafaf00af, + 0x7c7c007c7c7c007c, 0x1f1f001f1f1f001f, 0x3e3e003e3e3e003e, + 0xdcdc00dcdcdc00dc, 0x5e5e005e5e5e005e, 0x0b0b000b0b0b000b, + 0xa6a600a6a6a600a6, 0x3939003939390039, 0xd5d500d5d5d500d5, + 0x5d5d005d5d5d005d, 0xd9d900d9d9d900d9, 0x5a5a005a5a5a005a, + 0x5151005151510051, 0x6c6c006c6c6c006c, 0x8b8b008b8b8b008b, + 0x9a9a009a9a9a009a, 0xfbfb00fbfbfb00fb, 0xb0b000b0b0b000b0, + 0x7474007474740074, 0x2b2b002b2b2b002b, 0xf0f000f0f0f000f0, + 0x8484008484840084, 0xdfdf00dfdfdf00df, 0xcbcb00cbcbcb00cb, + 0x3434003434340034, 0x7676007676760076, 0x6d6d006d6d6d006d, + 0xa9a900a9a9a900a9, 0xd1d100d1d1d100d1, 0x0404000404040004, + 0x1414001414140014, 0x3a3a003a3a3a003a, 0xdede00dedede00de, + 0x1111001111110011, 0x3232003232320032, 0x9c9c009c9c9c009c, + 0x5353005353530053, 0xf2f200f2f2f200f2, 0xfefe00fefefe00fe, + 0xcfcf00cfcfcf00cf, 0xc3c300c3c3c300c3, 0x7a7a007a7a7a007a, + 
0x2424002424240024, 0xe8e800e8e8e800e8, 0x6060006060600060, + 0x6969006969690069, 0xaaaa00aaaaaa00aa, 0xa0a000a0a0a000a0, + 0xa1a100a1a1a100a1, 0x6262006262620062, 0x5454005454540054, + 0x1e1e001e1e1e001e, 0xe0e000e0e0e000e0, 0x6464006464640064, + 0x1010001010100010, 0x0000000000000000, 0xa3a300a3a3a300a3, + 0x7575007575750075, 0x8a8a008a8a8a008a, 0xe6e600e6e6e600e6, + 0x0909000909090009, 0xdddd00dddddd00dd, 0x8787008787870087, + 0x8383008383830083, 0xcdcd00cdcdcd00cd, 0x9090009090900090, + 0x7373007373730073, 0xf6f600f6f6f600f6, 0x9d9d009d9d9d009d, + 0xbfbf00bfbfbf00bf, 0x5252005252520052, 0xd8d800d8d8d800d8, + 0xc8c800c8c8c800c8, 0xc6c600c6c6c600c6, 0x8181008181810081, + 0x6f6f006f6f6f006f, 0x1313001313130013, 0x6363006363630063, + 0xe9e900e9e9e900e9, 0xa7a700a7a7a700a7, 0x9f9f009f9f9f009f, + 0xbcbc00bcbcbc00bc, 0x2929002929290029, 0xf9f900f9f9f900f9, + 0x2f2f002f2f2f002f, 0xb4b400b4b4b400b4, 0x7878007878780078, + 0x0606000606060006, 0xe7e700e7e7e700e7, 0x7171007171710071, + 0xd4d400d4d4d400d4, 0xabab00ababab00ab, 0x8888008888880088, + 0x8d8d008d8d8d008d, 0x7272007272720072, 0xb9b900b9b9b900b9, + 0xf8f800f8f8f800f8, 0xacac00acacac00ac, 0x3636003636360036, + 0x2a2a002a2a2a002a, 0x3c3c003c3c3c003c, 0xf1f100f1f1f100f1, + 0x4040004040400040, 0xd3d300d3d3d300d3, 0xbbbb00bbbbbb00bb, + 0x4343004343430043, 0x1515001515150015, 0xadad00adadad00ad, + 0x7777007777770077, 0x8080008080800080, 0x8282008282820082, + 0xecec00ececec00ec, 0x2727002727270027, 0xe5e500e5e5e500e5, + 0x8585008585850085, 0x3535003535350035, 0x0c0c000c0c0c000c, + 0x4141004141410041, 0xefef00efefef00ef, 0x9393009393930093, + 0x1919001919190019, 0x2121002121210021, 0x0e0e000e0e0e000e, + 0x4e4e004e4e4e004e, 0x6565006565650065, 0xbdbd00bdbdbd00bd, + 0xb8b800b8b8b800b8, 0x8f8f008f8f8f008f, 0xebeb00ebebeb00eb, + 0xcece00cecece00ce, 0x3030003030300030, 0x5f5f005f5f5f005f, + 0xc5c500c5c5c500c5, 0x1a1a001a1a1a001a, 0xe1e100e1e1e100e1, + 0xcaca00cacaca00ca, 0x4747004747470047, 0x3d3d003d3d3d003d, + 
0x0101000101010001, 0xd6d600d6d6d600d6, 0x5656005656560056, + 0x4d4d004d4d4d004d, 0x0d0d000d0d0d000d, 0x6666006666660066, + 0xcccc00cccccc00cc, 0x2d2d002d2d2d002d, 0x1212001212120012, + 0x2020002020200020, 0xb1b100b1b1b100b1, 0x9999009999990099, + 0x4c4c004c4c4c004c, 0xc2c200c2c2c200c2, 0x7e7e007e7e7e007e, + 0x0505000505050005, 0xb7b700b7b7b700b7, 0x3131003131310031, + 0x1717001717170017, 0xd7d700d7d7d700d7, 0x5858005858580058, + 0x6161006161610061, 0x1b1b001b1b1b001b, 0x1c1c001c1c1c001c, + 0x0f0f000f0f0f000f, 0x1616001616160016, 0x1818001818180018, + 0x2222002222220022, 0x4444004444440044, 0xb2b200b2b2b200b2, + 0xb5b500b5b5b500b5, 0x9191009191910091, 0x0808000808080008, + 0xa8a800a8a8a800a8, 0xfcfc00fcfcfc00fc, 0x5050005050500050, + 0xd0d000d0d0d000d0, 0x7d7d007d7d7d007d, 0x8989008989890089, + 0x9797009797970097, 0x5b5b005b5b5b005b, 0x9595009595950095, + 0xffff00ffffff00ff, 0xd2d200d2d2d200d2, 0xc4c400c4c4c400c4, + 0x4848004848480048, 0xf7f700f7f7f700f7, 0xdbdb00dbdbdb00db, + 0x0303000303030003, 0xdada00dadada00da, 0x3f3f003f3f3f003f, + 0x9494009494940094, 0x5c5c005c5c5c005c, 0x0202000202020002, + 0x4a4a004a4a4a004a, 0x3333003333330033, 0x6767006767670067, + 0xf3f300f3f3f300f3, 0x7f7f007f7f7f007f, 0xe2e200e2e2e200e2, + 0x9b9b009b9b9b009b, 0x2626002626260026, 0x3737003737370037, + 0x3b3b003b3b3b003b, 0x9696009696960096, 0x4b4b004b4b4b004b, + 0xbebe00bebebe00be, 0x2e2e002e2e2e002e, 0x7979007979790079, + 0x8c8c008c8c8c008c, 0x6e6e006e6e6e006e, 0x8e8e008e8e8e008e, + 0xf5f500f5f5f500f5, 0xb6b600b6b6b600b6, 0xfdfd00fdfdfd00fd, + 0x5959005959590059, 0x9898009898980098, 0x6a6a006a6a6a006a, + 0x4646004646460046, 0xbaba00bababa00ba, 0x2525002525250025, + 0x4242004242420042, 0xa2a200a2a2a200a2, 0xfafa00fafafa00fa, + 0x0707000707070007, 0x5555005555550055, 0xeeee00eeeeee00ee, + 0x0a0a000a0a0a000a, 0x4949004949490049, 0x6868006868680068, + 0x3838003838380038, 0xa4a400a4a4a400a4, 0x2828002828280028, + 0x7b7b007b7b7b007b, 0xc9c900c9c9c900c9, 0xc1c100c1c1c100c1, + 
0xe3e300e3e3e300e3, 0xf4f400f4f4f400f4, 0xc7c700c7c7c700c7, + 0x9e9e009e9e9e009e, +}; + +const u64 camellia_sp11101110[256] = { + 0x7070700070707000, 0x8282820082828200, 0x2c2c2c002c2c2c00, + 0xececec00ececec00, 0xb3b3b300b3b3b300, 0x2727270027272700, + 0xc0c0c000c0c0c000, 0xe5e5e500e5e5e500, 0xe4e4e400e4e4e400, + 0x8585850085858500, 0x5757570057575700, 0x3535350035353500, + 0xeaeaea00eaeaea00, 0x0c0c0c000c0c0c00, 0xaeaeae00aeaeae00, + 0x4141410041414100, 0x2323230023232300, 0xefefef00efefef00, + 0x6b6b6b006b6b6b00, 0x9393930093939300, 0x4545450045454500, + 0x1919190019191900, 0xa5a5a500a5a5a500, 0x2121210021212100, + 0xededed00ededed00, 0x0e0e0e000e0e0e00, 0x4f4f4f004f4f4f00, + 0x4e4e4e004e4e4e00, 0x1d1d1d001d1d1d00, 0x6565650065656500, + 0x9292920092929200, 0xbdbdbd00bdbdbd00, 0x8686860086868600, + 0xb8b8b800b8b8b800, 0xafafaf00afafaf00, 0x8f8f8f008f8f8f00, + 0x7c7c7c007c7c7c00, 0xebebeb00ebebeb00, 0x1f1f1f001f1f1f00, + 0xcecece00cecece00, 0x3e3e3e003e3e3e00, 0x3030300030303000, + 0xdcdcdc00dcdcdc00, 0x5f5f5f005f5f5f00, 0x5e5e5e005e5e5e00, + 0xc5c5c500c5c5c500, 0x0b0b0b000b0b0b00, 0x1a1a1a001a1a1a00, + 0xa6a6a600a6a6a600, 0xe1e1e100e1e1e100, 0x3939390039393900, + 0xcacaca00cacaca00, 0xd5d5d500d5d5d500, 0x4747470047474700, + 0x5d5d5d005d5d5d00, 0x3d3d3d003d3d3d00, 0xd9d9d900d9d9d900, + 0x0101010001010100, 0x5a5a5a005a5a5a00, 0xd6d6d600d6d6d600, + 0x5151510051515100, 0x5656560056565600, 0x6c6c6c006c6c6c00, + 0x4d4d4d004d4d4d00, 0x8b8b8b008b8b8b00, 0x0d0d0d000d0d0d00, + 0x9a9a9a009a9a9a00, 0x6666660066666600, 0xfbfbfb00fbfbfb00, + 0xcccccc00cccccc00, 0xb0b0b000b0b0b000, 0x2d2d2d002d2d2d00, + 0x7474740074747400, 0x1212120012121200, 0x2b2b2b002b2b2b00, + 0x2020200020202000, 0xf0f0f000f0f0f000, 0xb1b1b100b1b1b100, + 0x8484840084848400, 0x9999990099999900, 0xdfdfdf00dfdfdf00, + 0x4c4c4c004c4c4c00, 0xcbcbcb00cbcbcb00, 0xc2c2c200c2c2c200, + 0x3434340034343400, 0x7e7e7e007e7e7e00, 0x7676760076767600, + 0x0505050005050500, 0x6d6d6d006d6d6d00, 0xb7b7b700b7b7b700, + 
0xa9a9a900a9a9a900, 0x3131310031313100, 0xd1d1d100d1d1d100, + 0x1717170017171700, 0x0404040004040400, 0xd7d7d700d7d7d700, + 0x1414140014141400, 0x5858580058585800, 0x3a3a3a003a3a3a00, + 0x6161610061616100, 0xdedede00dedede00, 0x1b1b1b001b1b1b00, + 0x1111110011111100, 0x1c1c1c001c1c1c00, 0x3232320032323200, + 0x0f0f0f000f0f0f00, 0x9c9c9c009c9c9c00, 0x1616160016161600, + 0x5353530053535300, 0x1818180018181800, 0xf2f2f200f2f2f200, + 0x2222220022222200, 0xfefefe00fefefe00, 0x4444440044444400, + 0xcfcfcf00cfcfcf00, 0xb2b2b200b2b2b200, 0xc3c3c300c3c3c300, + 0xb5b5b500b5b5b500, 0x7a7a7a007a7a7a00, 0x9191910091919100, + 0x2424240024242400, 0x0808080008080800, 0xe8e8e800e8e8e800, + 0xa8a8a800a8a8a800, 0x6060600060606000, 0xfcfcfc00fcfcfc00, + 0x6969690069696900, 0x5050500050505000, 0xaaaaaa00aaaaaa00, + 0xd0d0d000d0d0d000, 0xa0a0a000a0a0a000, 0x7d7d7d007d7d7d00, + 0xa1a1a100a1a1a100, 0x8989890089898900, 0x6262620062626200, + 0x9797970097979700, 0x5454540054545400, 0x5b5b5b005b5b5b00, + 0x1e1e1e001e1e1e00, 0x9595950095959500, 0xe0e0e000e0e0e000, + 0xffffff00ffffff00, 0x6464640064646400, 0xd2d2d200d2d2d200, + 0x1010100010101000, 0xc4c4c400c4c4c400, 0x0000000000000000, + 0x4848480048484800, 0xa3a3a300a3a3a300, 0xf7f7f700f7f7f700, + 0x7575750075757500, 0xdbdbdb00dbdbdb00, 0x8a8a8a008a8a8a00, + 0x0303030003030300, 0xe6e6e600e6e6e600, 0xdadada00dadada00, + 0x0909090009090900, 0x3f3f3f003f3f3f00, 0xdddddd00dddddd00, + 0x9494940094949400, 0x8787870087878700, 0x5c5c5c005c5c5c00, + 0x8383830083838300, 0x0202020002020200, 0xcdcdcd00cdcdcd00, + 0x4a4a4a004a4a4a00, 0x9090900090909000, 0x3333330033333300, + 0x7373730073737300, 0x6767670067676700, 0xf6f6f600f6f6f600, + 0xf3f3f300f3f3f300, 0x9d9d9d009d9d9d00, 0x7f7f7f007f7f7f00, + 0xbfbfbf00bfbfbf00, 0xe2e2e200e2e2e200, 0x5252520052525200, + 0x9b9b9b009b9b9b00, 0xd8d8d800d8d8d800, 0x2626260026262600, + 0xc8c8c800c8c8c800, 0x3737370037373700, 0xc6c6c600c6c6c600, + 0x3b3b3b003b3b3b00, 0x8181810081818100, 0x9696960096969600, + 
0x6f6f6f006f6f6f00, 0x4b4b4b004b4b4b00, 0x1313130013131300, + 0xbebebe00bebebe00, 0x6363630063636300, 0x2e2e2e002e2e2e00, + 0xe9e9e900e9e9e900, 0x7979790079797900, 0xa7a7a700a7a7a700, + 0x8c8c8c008c8c8c00, 0x9f9f9f009f9f9f00, 0x6e6e6e006e6e6e00, + 0xbcbcbc00bcbcbc00, 0x8e8e8e008e8e8e00, 0x2929290029292900, + 0xf5f5f500f5f5f500, 0xf9f9f900f9f9f900, 0xb6b6b600b6b6b600, + 0x2f2f2f002f2f2f00, 0xfdfdfd00fdfdfd00, 0xb4b4b400b4b4b400, + 0x5959590059595900, 0x7878780078787800, 0x9898980098989800, + 0x0606060006060600, 0x6a6a6a006a6a6a00, 0xe7e7e700e7e7e700, + 0x4646460046464600, 0x7171710071717100, 0xbababa00bababa00, + 0xd4d4d400d4d4d400, 0x2525250025252500, 0xababab00ababab00, + 0x4242420042424200, 0x8888880088888800, 0xa2a2a200a2a2a200, + 0x8d8d8d008d8d8d00, 0xfafafa00fafafa00, 0x7272720072727200, + 0x0707070007070700, 0xb9b9b900b9b9b900, 0x5555550055555500, + 0xf8f8f800f8f8f800, 0xeeeeee00eeeeee00, 0xacacac00acacac00, + 0x0a0a0a000a0a0a00, 0x3636360036363600, 0x4949490049494900, + 0x2a2a2a002a2a2a00, 0x6868680068686800, 0x3c3c3c003c3c3c00, + 0x3838380038383800, 0xf1f1f100f1f1f100, 0xa4a4a400a4a4a400, + 0x4040400040404000, 0x2828280028282800, 0xd3d3d300d3d3d300, + 0x7b7b7b007b7b7b00, 0xbbbbbb00bbbbbb00, 0xc9c9c900c9c9c900, + 0x4343430043434300, 0xc1c1c100c1c1c100, 0x1515150015151500, + 0xe3e3e300e3e3e300, 0xadadad00adadad00, 0xf4f4f400f4f4f400, + 0x7777770077777700, 0xc7c7c700c7c7c700, 0x8080800080808000, + 0x9e9e9e009e9e9e00, +}; + +/* key constants */ +#define CAMELLIA_SIGMA1L (0xA09E667FL) +#define CAMELLIA_SIGMA1R (0x3BCC908BL) +#define CAMELLIA_SIGMA2L (0xB67AE858L) +#define CAMELLIA_SIGMA2R (0x4CAA73B2L) +#define CAMELLIA_SIGMA3L (0xC6EF372FL) +#define CAMELLIA_SIGMA3R (0xE94F82BEL) +#define CAMELLIA_SIGMA4L (0x54FF53A5L) +#define CAMELLIA_SIGMA4R (0xF1D36F1CL) +#define CAMELLIA_SIGMA5L (0x10E527FAL) +#define CAMELLIA_SIGMA5R (0xDE682D1DL) +#define CAMELLIA_SIGMA6L (0xB05688C2L) +#define CAMELLIA_SIGMA6R (0xB3E6C1FDL) + +/* macros */ +#define ROLDQ(l, r, bits) ({ 
/*
 * NOTE: this text is a whitespace-collapsed patch; every " + " below is the
 * diff marker that started one line of the original C file.  The two
 * physical lines that follow hold, in order:
 *
 * 1. The body of ROLDQ(l, r, bits) (its "#define" sits at the end of the
 *    previous physical line).  It copies l to a temporary, then assigns
 *      l = (l << bits) | (r >> (64 - bits))
 *      r = (r << bits) | (old l >> (64 - bits))
 *    i.e. it rotates the 128-bit pair l:r left by "bits" in place.  l and r
 *    must be lvalues and are expanded several times; "bits" has to stay in
 *    1..63 because a 64-bit shift by (64 - bits) == 64 is undefined in C.
 *
 * 2. CAMELLIA_F(x, kl, kr, y): computes ii = x ^ (((u64)kl << 32) | kr),
 *    XORs together eight table words, one per byte of ii starting from the
 *    least significant byte (tables in that order: sp11101110, sp44044404,
 *    sp30333033, sp02220222, sp00444404, sp03303033, sp22000222,
 *    sp10011110), and stores ror64(that XOR, 32) in y.
 *
 * 3. SET_SUBKEY_LR(INDEX, sRL): assigns ror64(sRL, 32) to subkey[INDEX]; it
 *    uses whatever variable named "subkey" is in scope where it expands.
 *
 * 4. The first part of camellia_setup_tail(subkey, subRL, max); the function
 *    runs on past the end of these two physical lines.
 *    - subRL is the caller's work array and is modified in place; subkey
 *      receives the final values.
 *    - max == 24 selects the short layout (reads subRL[0..25], writes
 *      subkey[0] and subkey[2..24]); any other value takes the else
 *      branches (reads subRL[0..33], writes subkey[0] and subkey[2..32]).
 *      subkey[1] is never written.  In this chunk the callers pass 24
 *      (camellia_setup128) and 32 (camellia_setup256).
 *    - Step (a): subRL[1] is XORed into subRL[3,5,7], [11,13,15] and
 *      [19,21,23] (then [27,29,31] in the long layout) and finally into the
 *      kw3 entry (subRL[24] or subRL[32]); between groups subRL[1] itself is
 *      updated from subRL[9], subRL[17] (and subRL[25] in the long layout).
 *    - Step (b): kw4 is taken from subRL[25] or subRL[33] and XORed into
 *      the even entries in descending order ([30,28,26] in the long layout,
 *      then [22,20,18], [14,12,10], [6,4,2,0]); between groups kw4 is
 *      updated from subRL[24] (long layout only), subRL[16] and subRL[8].
 *    - Step (c): the SET_SUBKEY_LR() calls combine neighbouring subRL
 *      entries into subkey[].  Each tl/dw/tr/tt group builds a 64-bit value
 *      tt with tl in the upper and tr in the lower 32 bits.
 *    - Statements of the form "dw = expr, x ^= rol32(dw, 1);" use the comma
 *      operator: dw (u32) first receives the truncated value of expr, then
 *      x is updated within the same statement.
 */
\ + u64 t = l; \ + l = (l << bits) | (r >> (64 - bits)); \ + r = (r << bits) | (t >> (64 - bits)); \ +}) + +#define CAMELLIA_F(x, kl, kr, y) ({ \ + u64 ii = x ^ (((u64)kl << 32) | kr); \ + y = camellia_sp11101110[(uint8_t)ii]; \ + y ^= camellia_sp44044404[(uint8_t)(ii >> 8)]; \ + ii >>= 16; \ + y ^= camellia_sp30333033[(uint8_t)ii]; \ + y ^= camellia_sp02220222[(uint8_t)(ii >> 8)]; \ + ii >>= 16; \ + y ^= camellia_sp00444404[(uint8_t)ii]; \ + y ^= camellia_sp03303033[(uint8_t)(ii >> 8)]; \ + ii >>= 16; \ + y ^= camellia_sp22000222[(uint8_t)ii]; \ + y ^= camellia_sp10011110[(uint8_t)(ii >> 8)]; \ + y = ror64(y, 32); \ +}) + +#define SET_SUBKEY_LR(INDEX, sRL) (subkey[(INDEX)] = ror64((sRL), 32)) + +static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max) +{ + u64 kw4, tt; + u32 dw, tl, tr; + + /* absorb kw2 to other subkeys */ + /* round 2 */ + subRL[3] ^= subRL[1]; + /* round 4 */ + subRL[5] ^= subRL[1]; + /* round 6 */ + subRL[7] ^= subRL[1]; + + subRL[1] ^= (subRL[1] & ~subRL[9]) << 32; + /* modified for FLinv(kl2) */ + dw = (subRL[1] & subRL[9]) >> 32, + subRL[1] ^= rol32(dw, 1); + + /* round 8 */ + subRL[11] ^= subRL[1]; + /* round 10 */ + subRL[13] ^= subRL[1]; + /* round 12 */ + subRL[15] ^= subRL[1]; + + subRL[1] ^= (subRL[1] & ~subRL[17]) << 32; + /* modified for FLinv(kl4) */ + dw = (subRL[1] & subRL[17]) >> 32, + subRL[1] ^= rol32(dw, 1); + + /* round 14 */ + subRL[19] ^= subRL[1]; + /* round 16 */ + subRL[21] ^= subRL[1]; + /* round 18 */ + subRL[23] ^= subRL[1]; + + if (max == 24) { + /* kw3 */ + subRL[24] ^= subRL[1]; + + /* absorb kw4 to other subkeys */ + kw4 = subRL[25]; + } else { + subRL[1] ^= (subRL[1] & ~subRL[25]) << 32; + /* modified for FLinv(kl6) */ + dw = (subRL[1] & subRL[25]) >> 32, + subRL[1] ^= rol32(dw, 1); + + /* round 20 */ + subRL[27] ^= subRL[1]; + /* round 22 */ + subRL[29] ^= subRL[1]; + /* round 24 */ + subRL[31] ^= subRL[1]; + /* kw3 */ + subRL[32] ^= subRL[1]; + + /* absorb kw4 to other subkeys */ + kw4 = subRL[33]; +
/* round 23 */ + subRL[30] ^= kw4; + /* round 21 */ + subRL[28] ^= kw4; + /* round 19 */ + subRL[26] ^= kw4; + + kw4 ^= (kw4 & ~subRL[24]) << 32; + /* modified for FL(kl5) */ + dw = (kw4 & subRL[24]) >> 32, + kw4 ^= rol32(dw, 1); + } + + /* round 17 */ + subRL[22] ^= kw4; + /* round 15 */ + subRL[20] ^= kw4; + /* round 13 */ + subRL[18] ^= kw4; + + kw4 ^= (kw4 & ~subRL[16]) << 32; + /* modified for FL(kl3) */ + dw = (kw4 & subRL[16]) >> 32, + kw4 ^= rol32(dw, 1); + + /* round 11 */ + subRL[14] ^= kw4; + /* round 9 */ + subRL[12] ^= kw4; + /* round 7 */ + subRL[10] ^= kw4; + + kw4 ^= (kw4 & ~subRL[8]) << 32; + /* modified for FL(kl1) */ + dw = (kw4 & subRL[8]) >> 32, + kw4 ^= rol32(dw, 1); + + /* round 5 */ + subRL[6] ^= kw4; + /* round 3 */ + subRL[4] ^= kw4; + /* round 1 */ + subRL[2] ^= kw4; + /* kw1 */ + subRL[0] ^= kw4; + + /* key XOR is end of F-function */ + SET_SUBKEY_LR(0, subRL[0] ^ subRL[2]); /* kw1 */ + SET_SUBKEY_LR(2, subRL[3]); /* round 1 */ + SET_SUBKEY_LR(3, subRL[2] ^ subRL[4]); /* round 2 */ + SET_SUBKEY_LR(4, subRL[3] ^ subRL[5]); /* round 3 */ + SET_SUBKEY_LR(5, subRL[4] ^ subRL[6]); /* round 4 */ + SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */ + + tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]); + dw = tl & (subRL[8] >> 32), /* FL(kl1) */ + tr = subRL[10] ^ rol32(dw, 1); + tt = (tr | ((u64)tl << 32)); + + SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */ + SET_SUBKEY_LR(8, subRL[8]); /* FL(kl1) */ + SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */ + + tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]); + dw = tl & (subRL[9] >> 32), /* FLinv(kl2) */ + tr = subRL[7] ^ rol32(dw, 1); + tt = (tr | ((u64)tl << 32)); + + SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */ + SET_SUBKEY_LR(11, subRL[10] ^ subRL[12]); /* round 8 */ + SET_SUBKEY_LR(12, subRL[11] ^ subRL[13]); /* round 9 */ + SET_SUBKEY_LR(13, subRL[12] ^ subRL[14]); /* round 10 */ + SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */ + + tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]); +
/*
 * NOTE: this text is a whitespace-collapsed patch; every " + " below is the
 * diff marker that started one line of the original C file.  The two
 * physical lines that follow hold, in order:
 *
 * 1. The remainder of camellia_setup_tail(): the SET_SUBKEY_LR() stores for
 *    subkey[15..22], followed by either subkey[23..24] (max == 24) or
 *    subkey[23..32] (any other max).  Each tl/dw/tr/tt group builds a
 *    64-bit value tt (tl in the upper 32 bits, tr in the lower 32 bits)
 *    from two subRL entries before it is XORed into a neighbouring entry.
 *
 * 2. camellia_setup128(key, subkey):
 *    - reads 16 bytes from key with two get_unaligned_be64() calls
 *      (key and key + 8) into kl and kr;
 *    - stores kl:kr and its successive ROLDQ() rotations into a local
 *      subRL[26] (after the third rotation only kr is kept, in subRL[13]);
 *    - reloads kl/kr from subRL[0]/subRL[1] and derives the pair the code
 *      calls KA with four CAMELLIA_F() steps using CAMELLIA_SIGMA1..4;
 *    - stores KA and its rotations into the remaining subRL slots, so all
 *      26 entries are assigned before use;
 *    - finishes with camellia_setup_tail(subkey, subRL, 24), which writes
 *      the final values through the subkey pointer.
 *    NOTE(review): subRL holds key-derived data and is not cleared before
 *    the function returns.
 *
 * 3. The opening of camellia_setup256(key, subkey): its local declarations
 *    and the left-hand side of the first key load; the function continues
 *    on the next physical line.
 */
dw = tl & (subRL[16] >> 32), /* FL(kl3) */ + tr = subRL[18] ^ rol32(dw, 1); + tt = (tr | ((u64)tl << 32)); + + SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */ + SET_SUBKEY_LR(16, subRL[16]); /* FL(kl3) */ + SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */ + + tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]); + dw = tl & (subRL[17] >> 32), /* FLinv(kl4) */ + tr = subRL[15] ^ rol32(dw, 1); + tt = (tr | ((u64)tl << 32)); + + SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */ + SET_SUBKEY_LR(19, subRL[18] ^ subRL[20]); /* round 14 */ + SET_SUBKEY_LR(20, subRL[19] ^ subRL[21]); /* round 15 */ + SET_SUBKEY_LR(21, subRL[20] ^ subRL[22]); /* round 16 */ + SET_SUBKEY_LR(22, subRL[21] ^ subRL[23]); /* round 17 */ + + if (max == 24) { + SET_SUBKEY_LR(23, subRL[22]); /* round 18 */ + SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */ + } else { + tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]); + dw = tl & (subRL[24] >> 32), /* FL(kl5) */ + tr = subRL[26] ^ rol32(dw, 1); + tt = (tr | ((u64)tl << 32)); + + SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */ + SET_SUBKEY_LR(24, subRL[24]); /* FL(kl5) */ + SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */ + + tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]); + dw = tl & (subRL[25] >> 32), /* FLinv(kl6) */ + tr = subRL[23] ^ rol32(dw, 1); + tt = (tr | ((u64)tl << 32)); + + SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */ + SET_SUBKEY_LR(27, subRL[26] ^ subRL[28]); /* round 20 */ + SET_SUBKEY_LR(28, subRL[27] ^ subRL[29]); /* round 21 */ + SET_SUBKEY_LR(29, subRL[28] ^ subRL[30]); /* round 22 */ + SET_SUBKEY_LR(30, subRL[29] ^ subRL[31]); /* round 23 */ + SET_SUBKEY_LR(31, subRL[30]); /* round 24 */ + SET_SUBKEY_LR(32, subRL[32] ^ subRL[31]); /* kw3 */ + } +} + +static void camellia_setup128(const unsigned char *key, u64 *subkey) +{ + u64 kl, kr, ww; + u64 subRL[26]; + + /** + * k == kl || kr (|| is concatenation) + */ + kl = get_unaligned_be64(key); + kr = get_unaligned_be64(key + 8); + + /* generate KL dependent subkeys */ + /*
kw1 */ + subRL[0] = kl; + /* kw2 */ + subRL[1] = kr; + + /* rotation left shift 15bit */ + ROLDQ(kl, kr, 15); + + /* k3 */ + subRL[4] = kl; + /* k4 */ + subRL[5] = kr; + + /* rotation left shift 15+30bit */ + ROLDQ(kl, kr, 30); + + /* k7 */ + subRL[10] = kl; + /* k8 */ + subRL[11] = kr; + + /* rotation left shift 15+30+15bit */ + ROLDQ(kl, kr, 15); + + /* k10 */ + subRL[13] = kr; + /* rotation left shift 15+30+15+17 bit */ + ROLDQ(kl, kr, 17); + + /* kl3 */ + subRL[16] = kl; + /* kl4 */ + subRL[17] = kr; + + /* rotation left shift 15+30+15+17+17 bit */ + ROLDQ(kl, kr, 17); + + /* k13 */ + subRL[18] = kl; + /* k14 */ + subRL[19] = kr; + + /* rotation left shift 15+30+15+17+17+17 bit */ + ROLDQ(kl, kr, 17); + + /* k17 */ + subRL[22] = kl; + /* k18 */ + subRL[23] = kr; + + /* generate KA */ + kl = subRL[0]; + kr = subRL[1]; + CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww); + kr ^= ww; + CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl); + + /* current status == (kll, klr, w0, w1) */ + CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr); + kr ^= ww; + CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww); + kl ^= ww; + + /* generate KA dependent subkeys */ + /* k1, k2 */ + subRL[2] = kl; + subRL[3] = kr; + ROLDQ(kl, kr, 15); + /* k5,k6 */ + subRL[6] = kl; + subRL[7] = kr; + ROLDQ(kl, kr, 15); + /* kl1, kl2 */ + subRL[8] = kl; + subRL[9] = kr; + ROLDQ(kl, kr, 15); + /* k9 */ + subRL[12] = kl; + ROLDQ(kl, kr, 15); + /* k11, k12 */ + subRL[14] = kl; + subRL[15] = kr; + ROLDQ(kl, kr, 34); + /* k15, k16 */ + subRL[20] = kl; + subRL[21] = kr; + ROLDQ(kl, kr, 17); + /* kw3, kw4 */ + subRL[24] = kl; + subRL[25] = kr; + + camellia_setup_tail(subkey, subRL, 24); +} + +static void camellia_setup256(const unsigned char *key, u64 *subkey) +{ + u64 kl, kr; /* left half of key */ + u64 krl, krr; /* right half of key */ + u64 ww; /* temporary variables */ + u64 subRL[34]; + + /** + * key = (kl || kr || krl || krr) (|| is concatenation) + */ + kl =
/*
 * NOTE: this text is a whitespace-collapsed patch; every " + " below is the
 * diff marker that started one line of the original C file.  The two
 * physical lines that follow hold, in order:
 *
 * 1. The rest of camellia_setup256(key, subkey) (its header and local
 *    declarations are on the previous physical line):
 *    - loads 32 key bytes as four values with get_unaligned_be64() at
 *      offsets 0, 8, 16 and 24 into kl, kr, krl, krr;
 *    - fills subRL[] from rotations of kl:kr and of krl:krr.  The five
 *      ROLDQ(krl, krr, ...) calls before "generate KA" add up to
 *      15 + 15 + 30 + 34 + 34 = 128 bits, so krl:krr holds its loaded value
 *      again when KA is computed;
 *    - derives KA (in kl/kr) with CAMELLIA_F() and CAMELLIA_SIGMA1..4, then
 *      KB (in krl/krr) with CAMELLIA_SIGMA5..6;
 *    - stores rotations of KA and KB into the remaining slots, so all 34
 *      subRL entries are assigned, and calls
 *      camellia_setup_tail(subkey, subRL, 32).
 *
 * 2. camellia_setup192(key, subkey): copies the 24 key bytes into a local
 *    32-byte buffer, fills the last 8 bytes with the bitwise complement of
 *    key bytes 16..23 (via a u64 temporary and memcpy), and runs
 *    camellia_setup256() on that buffer.
 *    NOTE(review): the stack buffer kk holds key bytes and is not cleared
 *    before the function returns.
 *
 * 3. __camellia_setkey(cctx, key, key_len, flags): if key_len is not 16, 24
 *    or 32 it ORs CRYPTO_TFM_RES_BAD_KEY_LEN into *flags and returns
 *    -EINVAL.  Otherwise it stores key_len in cctx->key_length, runs the
 *    matching camellia_setup128/192/256() on cctx->key_table and returns 0.
 *    The switch has no default because every other length was rejected by
 *    the check above it.
 *
 * 4. camellia_setkey(tfm, in_key, key_len): forwards to __camellia_setkey()
 *    with crypto_tfm_ctx(tfm) as the context and &tfm->crt_flags as flags,
 *    returning its result.
 *
 * 5. The opening of ecb_crypt(); it is cut off at the end of this chunk and
 *    is therefore not described here.
 */
get_unaligned_be64(key); + kr = get_unaligned_be64(key + 8); + krl = get_unaligned_be64(key + 16); + krr = get_unaligned_be64(key + 24); + + /* generate KL dependent subkeys */ + /* kw1 */ + subRL[0] = kl; + /* kw2 */ + subRL[1] = kr; + ROLDQ(kl, kr, 45); + /* k9 */ + subRL[12] = kl; + /* k10 */ + subRL[13] = kr; + ROLDQ(kl, kr, 15); + /* kl3 */ + subRL[16] = kl; + /* kl4 */ + subRL[17] = kr; + ROLDQ(kl, kr, 17); + /* k17 */ + subRL[22] = kl; + /* k18 */ + subRL[23] = kr; + ROLDQ(kl, kr, 34); + /* k23 */ + subRL[30] = kl; + /* k24 */ + subRL[31] = kr; + + /* generate KR dependent subkeys */ + ROLDQ(krl, krr, 15); + /* k3 */ + subRL[4] = krl; + /* k4 */ + subRL[5] = krr; + ROLDQ(krl, krr, 15); + /* kl1 */ + subRL[8] = krl; + /* kl2 */ + subRL[9] = krr; + ROLDQ(krl, krr, 30); + /* k13 */ + subRL[18] = krl; + /* k14 */ + subRL[19] = krr; + ROLDQ(krl, krr, 34); + /* k19 */ + subRL[26] = krl; + /* k20 */ + subRL[27] = krr; + ROLDQ(krl, krr, 34); + + /* generate KA */ + kl = subRL[0] ^ krl; + kr = subRL[1] ^ krr; + + CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww); + kr ^= ww; + CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl); + kl ^= krl; + CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr); + kr ^= ww ^ krr; + CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww); + kl ^= ww; + + /* generate KB */ + krl ^= kl; + krr ^= kr; + CAMELLIA_F(krl, CAMELLIA_SIGMA5L, CAMELLIA_SIGMA5R, ww); + krr ^= ww; + CAMELLIA_F(krr, CAMELLIA_SIGMA6L, CAMELLIA_SIGMA6R, ww); + krl ^= ww; + + /* generate KA dependent subkeys */ + ROLDQ(kl, kr, 15); + /* k5 */ + subRL[6] = kl; + /* k6 */ + subRL[7] = kr; + ROLDQ(kl, kr, 30); + /* k11 */ + subRL[14] = kl; + /* k12 */ + subRL[15] = kr; + /* rotation left shift 32bit */ + ROLDQ(kl, kr, 32); + /* kl5 */ + subRL[24] = kl; + /* kl6 */ + subRL[25] = kr; + /* rotation left shift 17 from k11,k12 -> k21,k22 */ + ROLDQ(kl, kr, 17); + /* k21 */ + subRL[28] = kl; + /* k22 */ + subRL[29] = kr; + + /* generate KB dependent subkeys */ + /*
k1 */ + subRL[2] = krl; + /* k2 */ + subRL[3] = krr; + ROLDQ(krl, krr, 30); + /* k7 */ + subRL[10] = krl; + /* k8 */ + subRL[11] = krr; + ROLDQ(krl, krr, 30); + /* k15 */ + subRL[20] = krl; + /* k16 */ + subRL[21] = krr; + ROLDQ(krl, krr, 51); + /* kw3 */ + subRL[32] = krl; + /* kw4 */ + subRL[33] = krr; + + camellia_setup_tail(subkey, subRL, 32); +} + +static void camellia_setup192(const unsigned char *key, u64 *subkey) +{ + unsigned char kk[32]; + u64 krl, krr; + + memcpy(kk, key, 24); + memcpy((unsigned char *)&krl, key+16, 8); + krr = ~krl; + memcpy(kk+24, (unsigned char *)&krr, 8); + camellia_setup256(kk, subkey); +} + +static int __camellia_setkey(struct camellia_ctx *cctx, + const unsigned char *key, + unsigned int key_len, u32 *flags) +{ + if (key_len != 16 && key_len != 24 && key_len != 32) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + cctx->key_length = key_len; + + switch (key_len) { + case 16: + camellia_setup128(key, cctx->key_table); + break; + case 24: + camellia_setup192(key, cctx->key_table); + break; + case 32: + camellia_setup256(key, cctx->key_table); + break; + } + + return 0; +} + +static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, + &tfm->crt_flags); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + void (*fn)(struct camellia_ctx *, u8 *, const u8 *), + void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *)) +{ + struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = CAMELLIA_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + /* Process two block batch */ + if (nbytes >= bsize * 2) { + do { + fn_2way(ctx, wdst, wsrc); + + wsrc += bsize * 2; + wdst += bsize * 2; + nbytes -= bsize * 2; + } while (nbytes
>= bsize * 2); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + fn(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way); +} + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = CAMELLIA_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct camellia_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = CAMELLIA_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ivs[2 - 1]; + u128 last_iv; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process two block batch */ + if (nbytes >= bsize * 2) { + do { + nbytes -= bsize * (2 - 1); + src -= 2 - 1; + dst -= 2 - 1; + + ivs[0] = src[0]; + + camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); + + u128_xor(dst + 1, dst + 1, ivs + 0); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 2); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +static void ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct camellia_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); + u8 keystream[CAMELLIA_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + u128 ctrblk; + + memcpy(keystream, src, nbytes); + camellia_enc_blk_xor(ctx, keystream, walk->iv); + memcpy(dst, keystream, nbytes); + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + u128_inc(&ctrblk); + u128_to_be128((be128 *)walk->iv, &ctrblk); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = CAMELLIA_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + be128 ctrblocks[2]; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process two block batch */ + if (nbytes >= bsize * 2) { + do { + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + } + + /* create ctrblks for parallel encrypt */ + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + u128_to_be128(&ctrblocks[1], &ctrblk); + u128_inc(&ctrblk); + + camellia_enc_blk_xor_2way(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += 2; + dst += 2; + nbytes -= bsize * 2; + } while (nbytes >= bsize * 2); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + + camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE); + + while ((nbytes = walk.nbytes) >= 
CAMELLIA_BLOCK_SIZE) { + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; + struct camellia_ctx *ctx = priv; + int i; + + while (nbytes >= 2 * bsize) { + camellia_enc_blk_2way(ctx, srcdst, srcdst); + srcdst += bsize * 2; + nbytes -= bsize * 2; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + camellia_enc_blk(ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; + struct camellia_ctx *ctx = priv; + int i; + + while (nbytes >= 2 * bsize) { + camellia_dec_blk_2way(ctx, srcdst, srcdst); + srcdst += bsize * 2; + nbytes -= bsize * 2; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + camellia_dec_blk(ctx, srcdst, srcdst); +} + +struct camellia_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct camellia_ctx camellia_ctx; +}; + +static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __camellia_setkey(&ctx->camellia_ctx, key, + keylen - CAMELLIA_BLOCK_SIZE, + &tfm->crt_flags); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, + key + keylen - CAMELLIA_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[2 * 4]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &ctx->camellia_ctx, + .crypt_fn = encrypt_callback, + }; + + return lrw_crypt(desc, dst, src, nbytes, &req); +} + +static int 
lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[2 * 4]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &ctx->camellia_ctx, + .crypt_fn = decrypt_callback, + }; + + return lrw_crypt(desc, dst, src, nbytes, &req); +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +struct camellia_xts_ctx { + struct camellia_ctx tweak_ctx; + struct camellia_ctx crypt_ctx; +}; + +static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, + flags); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[2 * 4]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), + .crypt_ctx = &ctx->crypt_ctx, + .crypt_fn = encrypt_callback, + }; + + return xts_crypt(desc, dst, src, nbytes, &req); +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_xts_ctx *ctx = 
crypto_blkcipher_ctx(desc->tfm); + be128 buf[2 * 4]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), + .crypt_ctx = &ctx->crypt_ctx, + .crypt_fn = decrypt_callback, + }; + + return xts_crypt(desc, dst, src, nbytes, &req); +} + +static struct crypto_alg camellia_algs[6] = { { + .cra_name = "camellia", + .cra_driver_name = "camellia-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(camellia_algs[0].cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = CAMELLIA_MIN_KEY_SIZE, + .cia_max_keysize = CAMELLIA_MAX_KEY_SIZE, + .cia_setkey = camellia_setkey, + .cia_encrypt = camellia_encrypt, + .cia_decrypt = camellia_decrypt + } + } +}, { + .cra_name = "ecb(camellia)", + .cra_driver_name = "ecb-camellia-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(camellia_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(camellia)", + .cra_driver_name = "cbc-camellia-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(camellia_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = 
CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "ctr(camellia)", + .cra_driver_name = "ctr-camellia-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(camellia_algs[3].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "lrw(camellia)", + .cra_driver_name = "lrw-camellia-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(camellia_algs[4].cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = lrw_camellia_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "xts(camellia)", + .cra_driver_name = "xts-camellia-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(camellia_algs[5].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, + .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = xts_camellia_setkey, + .encrypt = xts_encrypt, + .decrypt = 
xts_decrypt, + }, + }, +} }; + +static bool is_blacklisted_cpu(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (boot_cpu_data.x86 == 0x0f) { + /* + * On Pentium 4, camellia-asm is slower than original assembler + * implementation because excessive uses of 64bit rotate and + * left-shifts (which are really slow on P4) needed to store and + * handle 128bit block in two 64bit registers. + */ + return true; + } + + return false; +} + +static int force; +module_param(force, int, 0); +MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); + +int __init init(void) +{ + if (!force && is_blacklisted_cpu()) { + printk(KERN_INFO + "camellia-x86_64: performance on this CPU " + "would be suboptimal: disabling " + "camellia-x86_64.\n"); + return -ENODEV; + } + + return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); +} + +void __exit fini(void) +{ + crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized"); +MODULE_ALIAS("camellia"); +MODULE_ALIAS("camellia-asm"); diff --git a/crypto/Kconfig b/crypto/Kconfig index e6cfe1a25137..6318edd6a457 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -654,6 +654,24 @@ config CRYPTO_CAMELLIA See also: +config CRYPTO_CAMELLIA_X86_64 + tristate "Camellia cipher algorithm (x86_64)" + depends on (X86 || UML_X86) && 64BIT + depends on CRYPTO + select CRYPTO_ALGAPI + select CRYPTO_LRW + select CRYPTO_XTS + help + Camellia cipher algorithm module (x86_64). + + Camellia is a symmetric key block cipher developed jointly + at NTT and Mitsubishi Electric Corporation. + + The Camellia specifies three key sizes: 128, 192 and 256 bits. + + See also: + + config CRYPTO_CAST5 tristate "CAST5 (CAST-128) cipher algorithm" select CRYPTO_ALGAPI -- cgit v1.2.3 From 31796ac4e8f0e88f5c10f1ad6dab8f19bebe44a4 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 14 Mar 2012 14:27:52 -0700 Subject: x32: Fix alignment fail in struct compat_siginfo Adding struct _sigchld_x32 caused a misalignment cascade in struct siginfo, because union _sifields is located on a 4-byte boundary (8-byte misaligned.) Adding new fields that are 8-byte aligned caused the intermediate structures to also be aligned to 8 bytes, thereby adding padding in unexpected places. Thus, change s64 to compat_s64 here, which makes it "misaligned on paper". In reality these fields *are* actually aligned (there are 3 preceding ints outside the union and 3 inside struct _sigchld_x32), but because of the intervening union and struct it is not possible for gcc to avoid padding without breaking the ABI. Reported-and-tested-by: H. J. Lu Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/include/asm/ia32.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 7d0c18587709..ee52760549f0 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -130,8 +130,8 @@ typedef struct compat_siginfo { unsigned int _pid; /* which child */ unsigned int _uid; /* sender's uid */ int _status; /* exit code */ - s64 _utime; - s64 _stime; + compat_s64 _utime; + compat_s64 _stime; } _sigchld_x32; /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ -- cgit v1.2.3 From 48b25c43e6eebb6c0edf72935e8720385beca76b Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 15 Mar 2012 13:13:38 -0400 Subject: [PATCH v3] ipc: provide generic compat versions of IPC syscalls When using the "compat" APIs, architectures will generally want to be able to make direct syscalls to msgsnd(), shmctl(), etc., and in the kernel we would want them to be handled directly by compat_sys_xxx() functions, as is true for other compat syscalls. 
However, for historical reasons, several of the existing compat IPC syscalls do not do this. semctl() expects a pointer to the fourth argument, instead of the fourth argument itself. msgsnd(), msgrcv() and shmat() expect arguments in different order. This change adds an ARCH_WANT_OLD_COMPAT_IPC config option that can be set to preserve this behavior for ports that use it (x86, sparc, powerpc, s390, and mips). No actual semantics are changed for those architectures, and there is only a minimal amount of code refactoring in ipc/compat.c. Newer architectures like tile (and perhaps future architectures such as arm64 and unicore64) should not select this option, and thus can avoid having any IPC-specific code at all in their architecture-specific compat layer. In the same vein, if this option is not selected, IPC_64 mode is assumed, since that's what the headers expect. The workaround code in "tile" for msgsnd() and msgrcv() is removed with this change; it also fixes the bug that shmat() and semctl() were not being properly handled. 
Reviewed-by: Arnd Bergmann Signed-off-by: Chris Metcalf --- arch/Kconfig | 3 ++ arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/include/asm/compat.h | 11 ------- arch/tile/kernel/compat.c | 43 -------------------------- arch/x86/Kconfig | 1 + include/linux/compat.h | 12 +++++++- ipc/compat.c | 70 ++++++++++++++++++++++++++++++++++++++---- 10 files changed, 83 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..b37f8f3ffa54 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -199,4 +199,7 @@ config HAVE_CMPXCHG_LOCAL config HAVE_CMPXCHG_DOUBLE bool +config ARCH_WANT_OLD_COMPAT_IPC + bool + source "kernel/gcov/Kconfig" diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5ab6e89603c5..4bbbb40f352a 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2456,6 +2456,7 @@ config MIPS32_COMPAT config COMPAT bool depends on MIPS32_COMPAT + select ARCH_WANT_OLD_COMPAT_IPC default y config SYSVIPC_COMPAT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1919634a9b32..48ab0bb38924 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -152,6 +152,7 @@ config COMPAT bool default y if PPC64 select COMPAT_BINFMT_ELF + select ARCH_WANT_OLD_COMPAT_IPC config SYSVIPC_COMPAT bool diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 6d99a5fcc090..0ff53e350092 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -218,6 +218,7 @@ config COMPAT prompt "Kernel support for 31 bit emulation" depends on 64BIT select COMPAT_BINFMT_ELF + select ARCH_WANT_OLD_COMPAT_IPC help Select this option if you want to enable your system kernel to handle system-calls from ELF binaries for 31 bit ESA. 
This option diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index ca5580e4d813..64e1a8e7cab3 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -576,6 +576,7 @@ config COMPAT depends on SPARC64 default y select COMPAT_BINFMT_ELF + select ARCH_WANT_OLD_COMPAT_IPC config SYSVIPC_COMPAT bool diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h index bf95f55b82b0..4b4b28969a65 100644 --- a/arch/tile/include/asm/compat.h +++ b/arch/tile/include/asm/compat.h @@ -242,17 +242,6 @@ long compat_sys_fallocate(int fd, int mode, long compat_sys_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval); -/* Versions of compat functions that differ from generic Linux. */ -struct compat_msgbuf; -long tile_compat_sys_msgsnd(int msqid, - struct compat_msgbuf __user *msgp, - size_t msgsz, int msgflg); -long tile_compat_sys_msgrcv(int msqid, - struct compat_msgbuf __user *msgp, - size_t msgsz, long msgtyp, int msgflg); -long tile_compat_sys_ptrace(compat_long_t request, compat_long_t pid, - compat_long_t addr, compat_long_t data); - /* Tilera Linux syscalls that don't have "compat" versions. */ #define compat_sys_flush_cache sys_flush_cache diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c index bf5e9d70266c..d67459b9ac2a 100644 --- a/arch/tile/kernel/compat.c +++ b/arch/tile/kernel/compat.c @@ -16,7 +16,6 @@ #define __SYSCALL_COMPAT #include -#include #include #include #include @@ -95,52 +94,10 @@ long compat_sys_sched_rr_get_interval(compat_pid_t pid, return ret; } -/* - * The usual compat_sys_msgsnd() and _msgrcv() seem to be assuming - * some different calling convention than our normal 32-bit tile code. - */ - -/* Already defined in ipc/compat.c, but we need it here. 
*/ -struct compat_msgbuf { - compat_long_t mtype; - char mtext[1]; -}; - -long tile_compat_sys_msgsnd(int msqid, - struct compat_msgbuf __user *msgp, - size_t msgsz, int msgflg) -{ - compat_long_t mtype; - - if (get_user(mtype, &msgp->mtype)) - return -EFAULT; - return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg); -} - -long tile_compat_sys_msgrcv(int msqid, - struct compat_msgbuf __user *msgp, - size_t msgsz, long msgtyp, int msgflg) -{ - long err, mtype; - - err = do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg); - if (err < 0) - goto out; - - if (put_user(mtype, &msgp->mtype)) - err = -EFAULT; - out: - return err; -} - /* Provide the compat syscall number to call mapping. */ #undef __SYSCALL #define __SYSCALL(nr, call) [nr] = (call), -/* The generic versions of these don't work for Tile. */ -#define compat_sys_msgrcv tile_compat_sys_msgrcv -#define compat_sys_msgsnd tile_compat_sys_msgsnd - /* See comments in sys.c */ #define compat_sys_fadvise64_64 sys32_fadvise64_64 #define compat_sys_readahead sys32_readahead diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..cde163dc6058 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2178,6 +2178,7 @@ config IA32_AOUT config COMPAT def_bool y depends on IA32_EMULATION + select ARCH_WANT_OLD_COMPAT_IPC config COMPAT_FOR_U64_ALIGNMENT def_bool COMPAT diff --git a/include/linux/compat.h b/include/linux/compat.h index 7e05fcee75a1..35c2dbf2448a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -224,6 +224,7 @@ struct compat_sysinfo; struct compat_sysctl_args; struct compat_kexec_segment; struct compat_mq_attr; +struct compat_msgbuf; extern void compat_exit_robust_list(struct task_struct *curr); @@ -234,13 +235,22 @@ asmlinkage long compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr); +#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC long compat_sys_semctl(int first, int second, int third, void __user *uptr); long 
compat_sys_msgsnd(int first, int second, int third, void __user *uptr); long compat_sys_msgrcv(int first, int second, int msgtyp, int third, int version, void __user *uptr); -long compat_sys_msgctl(int first, int second, void __user *uptr); long compat_sys_shmat(int first, int second, compat_uptr_t third, int version, void __user *uptr); +#else +long compat_sys_semctl(int semid, int semnum, int cmd, int arg); +long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp, + size_t msgsz, int msgflg); +long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp, + size_t msgsz, long msgtyp, int msgflg); +long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); +#endif +long compat_sys_msgctl(int first, int second, void __user *uptr); long compat_sys_shmctl(int first, int second, void __user *uptr); long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, unsigned nsems, const struct compat_timespec __user *timeout); diff --git a/ipc/compat.c b/ipc/compat.c index 845a28738d3a..a6df704f521e 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -117,6 +118,7 @@ extern int sem_ctls[]; static inline int compat_ipc_parse_version(int *cmd) { +#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC int version = *cmd & IPC_64; /* this is tricky: architectures that have support for the old @@ -128,6 +130,10 @@ static inline int compat_ipc_parse_version(int *cmd) *cmd &= ~IPC_64; #endif return version; +#else + /* With the asm-generic APIs, we always use the 64-bit versions. 
*/ + return IPC_64; +#endif } static inline int __get_compat_ipc64_perm(struct ipc64_perm *p64, @@ -232,10 +238,9 @@ static inline int put_compat_semid_ds(struct semid64_ds *s, return err; } -long compat_sys_semctl(int first, int second, int third, void __user *uptr) +static long do_compat_semctl(int first, int second, int third, u32 pad) { union semun fourth; - u32 pad; int err, err2; struct semid64_ds s64; struct semid64_ds __user *up64; @@ -243,10 +248,6 @@ long compat_sys_semctl(int first, int second, int third, void __user *uptr) memset(&s64, 0, sizeof(s64)); - if (!uptr) - return -EINVAL; - if (get_user(pad, (u32 __user *) uptr)) - return -EFAULT; if ((third & (~IPC_64)) == SETVAL) fourth.val = (int) pad; else @@ -305,6 +306,18 @@ long compat_sys_semctl(int first, int second, int third, void __user *uptr) return err; } +#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC +long compat_sys_semctl(int first, int second, int third, void __user *uptr) +{ + u32 pad; + + if (!uptr) + return -EINVAL; + if (get_user(pad, (u32 __user *) uptr)) + return -EFAULT; + return do_compat_semctl(first, second, third, pad); +} + long compat_sys_msgsnd(int first, int second, int third, void __user *uptr) { struct compat_msgbuf __user *up = uptr; @@ -353,6 +366,37 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third, out: return err; } +#else +long compat_sys_semctl(int semid, int semnum, int cmd, int arg) +{ + return do_compat_semctl(semid, semnum, cmd, arg); +} + +long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp, + size_t msgsz, int msgflg) +{ + compat_long_t mtype; + + if (get_user(mtype, &msgp->mtype)) + return -EFAULT; + return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg); +} + +long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp, + size_t msgsz, long msgtyp, int msgflg) +{ + long err, mtype; + + err = do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg); + if (err < 0) + goto out; + + if (put_user(mtype, &msgp->mtype)) + 
err = -EFAULT; + out: + return err; +} +#endif static inline int get_compat_msqid64(struct msqid64_ds *m64, struct compat_msqid64_ds __user *up64) @@ -470,6 +514,7 @@ long compat_sys_msgctl(int first, int second, void __user *uptr) return err; } +#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC long compat_sys_shmat(int first, int second, compat_uptr_t third, int version, void __user *uptr) { @@ -485,6 +530,19 @@ long compat_sys_shmat(int first, int second, compat_uptr_t third, int version, uaddr = compat_ptr(third); return put_user(raddr, uaddr); } +#else +long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg) +{ + unsigned long ret; + long err; + + err = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &ret); + if (err) + return err; + force_successful_syscall_return(); + return (long)ret; +} +#endif static inline int get_compat_shmid64_ds(struct shmid64_ds *s64, struct compat_shmid64_ds __user *up64) -- cgit v1.2.3 From a939e817aa7e199d2fff05a67cb745be32dd5c2d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 1 Mar 2012 22:11:09 -0800 Subject: time: x86: Fix race switching from vsyscall to non-vsyscall clock When switching from a vsyscall capable to a non-vsyscall capable clocksource, there was a small race, where the last vsyscall gettimeofday before the switch might return an invalid time value using the new non-vsyscall enabled clocksource values after the switch is complete. This is due to the vsyscall code checking the vclock_mode once outside of the seqcount protected section. After it reads the vclock mode, it doesn't re-check that the sampled clock data that is obtained in the seqcount critical section still matches. The fix is to sample vclock_mode inside the protected section, and as long as it isn't VCLOCK_NONE, return the calculated value. If it has changed and is now VCLOCK_NONE, fall back to the syscall gettime calculation. 
v2: * Cleanup checks as suggested by tglx * Also fix same issue present in gettimeofday path CC: Andy Lutomirski CC: Thomas Gleixner Signed-off-by: John Stultz --- arch/x86/vdso/vclock_gettime.c | 72 +++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6bc0e723b6e8..7eeb1f6188ee 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -70,14 +70,26 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) return ret; } +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} + + notrace static inline long vgetns(void) { long v; cycles_t cycles; if (gtod->clock.vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); - else + else if (gtod->clock.vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); + else + return 0; v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; return (v * gtod->clock.mult) >> gtod->clock.shift; } @@ -85,21 +97,28 @@ notrace static inline long vgetns(void) notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; + int mode; + do { seq = read_seqbegin(&gtod->lock); + mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ts->tv_nsec = gtod->wall_time_nsec; ns = vgetns(); } while (unlikely(read_seqretry(&gtod->lock, seq))); + timespec_add_ns(ts, ns); - return 0; + return mode; } notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; + int mode; + do { seq = read_seqbegin(&gtod->lock); + mode = gtod->clock.vclock_mode; secs = gtod->wall_time_sec; ns = gtod->wall_time_nsec + vgetns(); secs += gtod->wall_to_monotonic.tv_sec; @@ -116,7 +135,7 @@ notrace static noinline int do_monotonic(struct timespec *ts) ts->tv_sec = secs; ts->tv_nsec = ns; - 
return 0; + return mode; } notrace static noinline int do_realtime_coarse(struct timespec *ts) @@ -156,14 +175,14 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { + int ret = VCLOCK_NONE; + switch (clock) { case CLOCK_REALTIME: - if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) - return do_realtime(ts); + ret = do_realtime(ts); break; case CLOCK_MONOTONIC: - if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) - return do_monotonic(ts); + ret = do_monotonic(ts); break; case CLOCK_REALTIME_COARSE: return do_realtime_coarse(ts); @@ -171,32 +190,33 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) return do_monotonic_coarse(ts); } - return vdso_fallback_gettime(clock, ts); + if (ret == VCLOCK_NONE) + return vdso_fallback_gettime(clock, ts); + return 0; } int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { - long ret; - if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) { - if (likely(tv != NULL)) { - BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != - offsetof(struct timespec, tv_nsec) || - sizeof(*tv) != sizeof(struct timespec)); - do_realtime((struct timespec *)tv); - tv->tv_usec /= 1000; - } - if (unlikely(tz != NULL)) { - /* Avoid memcpy. Some old compilers fail to inline it */ - tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; - tz->tz_dsttime = gtod->sys_tz.tz_dsttime; - } - return 0; + long ret = VCLOCK_NONE; + + if (likely(tv != NULL)) { + BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != + offsetof(struct timespec, tv_nsec) || + sizeof(*tv) != sizeof(struct timespec)); + ret = do_realtime((struct timespec *)tv); + tv->tv_usec /= 1000; } - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; + if (unlikely(tz != NULL)) { + /* Avoid memcpy. 
Some old compilers fail to inline it */ + tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; + tz->tz_dsttime = gtod->sys_tz.tz_dsttime; + } + + if (ret == VCLOCK_NONE) + return vdso_fallback_gtod(tv, tz); + return 0; } int gettimeofday(struct timeval *, struct timezone *) __attribute__((weak, alias("__vdso_gettimeofday"))); -- cgit v1.2.3 From 6c260d586343f7f78239d90aa9e2cfed02f74ff3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Feb 2012 19:46:04 +0000 Subject: x86: vdso: Remove bogus locking in update_vsyscall_tz() Changing the sequence count in update_vsyscall_tz() is completely pointless. The vdso code copies the data unprotected. There is no point to change this as sys_tz is nowhere protected at all. See sys_gettimeofday(). Reviewed-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- arch/x86/kernel/vsyscall_64.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b07ba9393564..33385c18e5d3 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -80,12 +80,7 @@ early_param("vsyscall", vsyscall_setup); void update_vsyscall_tz(void) { - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); - /* sys_tz has changed */ vsyscall_gtod_data.sys_tz = sys_tz; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, -- cgit v1.2.3 From 2ab516575f2f273b19d95140d02c54612201e80a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Feb 2012 19:46:04 +0000 Subject: x86: vdso: Use seqcount instead of seqlock The update of the vdso data happens under xtime_lock, so adding a nested lock is pointless. Just use a seqcount to sync the readers. 
Reviewed-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- arch/x86/include/asm/vgtod.h | 2 +- arch/x86/kernel/vsyscall_64.c | 11 +++-------- arch/x86/vdso/vclock_gettime.c | 16 ++++++++-------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 815285bcaceb..1f007178c813 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -5,7 +5,7 @@ #include struct vsyscall_gtod_data { - seqlock_t lock; + seqcount_t seq; /* open coded 'struct timespec' */ time_t wall_time_sec; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 33385c18e5d3..cdc95a707cd1 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -52,10 +52,7 @@ #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = -{ - .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), -}; +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -86,9 +83,7 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_seqcount_begin(&vsyscall_gtod_data.seq); /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -101,7 +96,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_seqcount_end(&vsyscall_gtod_data.seq); } static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 
7eeb1f6188ee..944c5e5d6b6a 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -100,12 +100,12 @@ notrace static noinline int do_realtime(struct timespec *ts) int mode; do { - seq = read_seqbegin(&gtod->lock); + seq = read_seqcount_begin(&gtod->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ts->tv_nsec = gtod->wall_time_nsec; ns = vgetns(); - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); timespec_add_ns(ts, ns); return mode; @@ -117,13 +117,13 @@ notrace static noinline int do_monotonic(struct timespec *ts) int mode; do { - seq = read_seqbegin(&gtod->lock); + seq = read_seqcount_begin(&gtod->seq); mode = gtod->clock.vclock_mode; secs = gtod->wall_time_sec; ns = gtod->wall_time_nsec + vgetns(); secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec * are all guaranteed to be nonnegative.
@@ -142,10 +142,10 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqbegin(&gtod->lock); + seq = read_seqcount_begin(&gtod->seq); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); return 0; } @@ -153,12 +153,12 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) { unsigned long seq, ns, secs; do { - seq = read_seqbegin(&gtod->lock); + seq = read_seqcount_begin(&gtod->seq); secs = gtod->wall_time_coarse.tv_sec; ns = gtod->wall_time_coarse.tv_nsec; secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(&gtod->lock, seq))); + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); /* wall_time_nsec and wall_to_monotonic.tv_nsec are * guaranteed to be between 0 and NSEC_PER_SEC. -- cgit v1.2.3 From 57779dc2b3b75bee05ef5d1ada47f615f7a13932 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 21 Feb 2012 18:19:55 -0800 Subject: x86, tsc: Skip refined tsc calibration on systems with reliable TSC While running the latest Linux as guest under VMware in highly over-committed situations, we have seen cases when the refined TSC algorithm fails to get a valid tsc_start value in tsc_refine_calibration_work from multiple attempts. As a result the kernel keeps on scheduling the tsc_irqwork task for later. Subsequently after several attempts when it gets a valid start value it goes through the refined calibration and either bails out or uses the new results. Given that the kernel originally read the TSC frequency from the platform, which is the best it can get, I don't think there is much value in refining it. So for systems which get the TSC frequency from the platform we should skip the refined tsc algorithm.
We can use the TSC_RELIABLE cpu cap flag to detect this, right now it is set only on VMware and for Moorestown Penwell both of which have their own TSC calibration methods. Signed-off-by: Alok N Kataria Cc: John Stultz Cc: Dirk Brandewie Cc: Alan Cox Cc: stable@kernel.org [jstultz: Reworked to simply not schedule the refining work, rather than scheduling the work and bombing out later] Signed-off-by: John Stultz --- arch/x86/kernel/tsc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..6fcfcb3865c2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -932,6 +932,16 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } + + /* + * Trust the results of the earlier calibration on systems + * exporting a reliable TSC. + */ + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + clocksource_register_khz(&clocksource_tsc, tsc_khz); + return 0; + } + schedule_delayed_work(&tsc_irqwork, 0); return 0; } -- cgit v1.2.3 From 641cc938815dfd09f8fa1ec72deb814f0938ac33 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 15 Mar 2012 20:09:14 +0100 Subject: perf: Adding sysfs group format attribute for pmu device Adding sysfs group 'format' attribute for pmu device that contains a syntax description on how to construct raw events. The event configuration is described in following struct perf_event_attr attributes: config config1 config2 Each sysfs attribute within the format attribute group, describes mapping of name and bitfield definition within one of above attributes.
eg: "/sys/.../format/event" contains "config:0-7" "/sys/.../format/umask" contains "config:8-15" "/sys/.../format/usr" contains "config:16" the attribute value syntax is: line: config ':' bits config: 'config' | 'config1' | 'config2" bits: bits ',' bit_term | bit_term bit_term: VALUE '-' VALUE | VALUE Adding format attribute definitions for x86 cpu pmus. Acked-by: Peter Zijlstra Signed-off-by: Peter Zijlstra Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/n/tip-vhdk5y2hyype9j63prymty36@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- .../testing/sysfs-bus-event_source-devices-format | 14 +++++++++ arch/x86/kernel/cpu/perf_event.c | 7 +++++ arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_amd.c | 18 +++++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 36 ++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_p6.c | 19 ++++++++++++ include/linux/perf_event.h | 14 +++++++++ 7 files changed, 109 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-format (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format new file mode 100644 index 000000000000..079afc71363d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format @@ -0,0 +1,14 @@ +Where: /sys/bus/event_source/devices//format +Date: January 2012 +Kernel Version: 3.3 +Contact: Jiri Olsa +Description: + Attribute group to describe the magic bits that go into + perf_event_attr::config[012] for a particular pmu. + Each attribute of this group defines the 'hardware' bitmask + we want to export, so that userspace can deal with sane + name/value pairs. + + Example: 'config1:1,6-10,44' + Defines contents of attribute that occupies bits 1,6-10,44 of + perf_event_attr::config1. 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0a18d16cb58d..453ac9497574 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1314,6 +1314,11 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1388,6 +1393,7 @@ static int __init init_hw_perf_events(void) } x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + x86_pmu_format_group.attrs = x86_pmu.format_attrs; pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); @@ -1668,6 +1674,7 @@ static struct attribute_group x86_pmu_attr_group = { static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, + &x86_pmu_format_group, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8484e77c211e..6638aaf54493 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -339,6 +339,7 @@ struct x86_pmu { * sysfs attrs */ int attr_rdpmc; + struct attribute **format_attrs; /* * CPU Hotplug hooks diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index dd002faff7a6..95e7fe1c5f0b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -404,6 +404,21 @@ static void amd_pmu_cpu_dead(int cpu) } } +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static 
__initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, @@ -426,6 +441,8 @@ static __initconst const struct x86_pmu amd_pmu = { .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, .cpu_dead = amd_pmu_cpu_dead, @@ -596,6 +613,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .cpu_dead = amd_pmu_cpu_dead, #endif .cpu_starting = amd_pmu_cpu_starting, + .format_attrs = amd_format_attr, }; __init int amd_pmu_init(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6a84e7f28f05..26b3e2fef104 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1431,6 +1431,24 @@ static void core_pmu_enable_all(int added) } } +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -1455,6 +1473,7 @@ static __initconst const struct x86_pmu core_pmu = { .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1553,6 +1572,21 @@ static void intel_pmu_flush_branch_stack(void) intel_pmu_lbr_reset(); } +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); 
+ +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + NULL, +}; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1576,6 +1610,8 @@ static __initconst const struct x86_pmu intel_pmu = { .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .format_attrs = intel_arch3_formats_attr, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index c7181befecde..32bcfc7dd230 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -87,6 +87,23 @@ static void p6_pmu_enable_event(struct perf_event *event) (void)checking_wrmsrl(hwc->config_base, val); } +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = x86_pmu_handle_irq, @@ -115,6 +132,8 @@ static __initconst const struct x86_pmu p6_pmu = { .cntval_mask = (1ULL << 32) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, }; __init int p6_pmu_init(void) diff --git a/include/linux/perf_event.h 
b/include/linux/perf_event.h index bd9f55a5958d..57ae485e80fc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -550,6 +550,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include #define PERF_MAX_STACK_DEPTH 255 @@ -1291,5 +1292,18 @@ do { \ register_cpu_notifier(&fn##_nb); \ } while (0) + +#define PMU_FORMAT_ATTR(_name, _format) \ +static ssize_t \ +_name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ + \ +static struct device_attribute format_attr_##_name = __ATTR_RO(_name) + #endif /* __KERNEL__ */ #endif /* _LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From c7b738351ba92f48b943ac59aff6b5b0f17f37c9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 5 Mar 2012 21:06:14 +0300 Subject: x86, efi: Fix pointer math issue in handle_ramdisks() "filename" is a efi_char16_t string so this check for reaching the end of the array doesn't work. We need to cast the pointer to (u8 *) before doing the math. This patch changes the "filename" to "filename_16" to avoid confusion in the future. Signed-off-by: Dan Carpenter Link: http://lkml.kernel.org/r/20120305180614.GA26880@elgon.mountain Acked-by: Matt Fleming Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/eboot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index fec216f4fbc3..0cdfc0d2315e 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -539,7 +539,7 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, struct initrd *initrd; efi_file_handle_t *h; efi_file_info_t *info; - efi_char16_t filename[256]; + efi_char16_t filename_16[256]; unsigned long info_sz; efi_guid_t info_guid = EFI_FILE_INFO_ID; efi_char16_t *p; @@ -552,14 +552,14 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, str += 7; initrd = &initrds[i]; - p = filename; + p = filename_16; /* Skip any leading slashes */ while (*str == '/' || *str == '\\') str++; while (*str && *str != ' ' && *str != '\n') { - if (p >= filename + sizeof(filename)) + if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16)) break; *p++ = *str++; @@ -583,7 +583,7 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, goto free_initrds; } - status = efi_call_phys5(fh->open, fh, &h, filename, + status = efi_call_phys5(fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); if (status != EFI_SUCCESS) goto close_handles; -- cgit v1.2.3 From 943bc7e110f269f88dc92bbf249adbd384d35f1c Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Thu, 15 Mar 2012 12:16:28 +0100 Subject: x86: Fix section warnings Fix the following section warnings : WARNING: vmlinux.o(.text+0x49dbc): Section mismatch in reference from the function acpi_map_cpu2node() to the variable .cpuinit.data:__apicid_to_node The function acpi_map_cpu2node() references the variable __cpuinitdata __apicid_to_node. This is often because acpi_map_cpu2node lacks a __cpuinitdata annotation or the annotation of __apicid_to_node is wrong. 
WARNING: vmlinux.o(.text+0x49dc1): Section mismatch in reference from the function acpi_map_cpu2node() to the function .cpuinit.text:numa_set_node() The function acpi_map_cpu2node() references the function __cpuinit numa_set_node(). This is often because acpi_map_cpu2node lacks a __cpuinit annotation or the annotation of numa_set_node is wrong. WARNING: vmlinux.o(.text+0x526e77): Section mismatch in reference from the function prealloc_protection_domains() to the function .init.text:alloc_passthrough_domain() The function prealloc_protection_domains() references the function __init alloc_passthrough_domain(). This is often because prealloc_protection_domains lacks a __init annotation or the annotation of alloc_passthrough_domain is wrong. Signed-off-by: Steffen Persvold Link: http://lkml.kernel.org/r/1331810188-24785-1-git-send-email-sp@numascale.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ce664f33ea8e..406ed77216d0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index f75e0608be5b..ae2ec929e52f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2804,7 +2804,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) * we don't need to preallocate the protection domains anymore. * For now we have to. 
*/ -static void prealloc_protection_domains(void) +static void __init prealloc_protection_domains(void) { struct iommu_dev_data *dev_data; struct dma_ops_domain *dma_dom; -- cgit v1.2.3 From dc72d99dabb870ca5bd6d9fff674be853bb4a88d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 18 Mar 2012 02:40:48 +0000 Subject: net: bpf_jit: fix BPF_S_LDX_B_MSH compilation Matt Evans spotted that x86 bpf_jit was incorrectly handling negative constant offsets in BPF_S_LDX_B_MSH instruction. We need to abort JIT compilation like we do in common_load so that filter uses the interpreter code and can call __load_pointer() Reference: http://lists.openwall.net/netdev/2011/07/19/11 Thanks to Indan Zupancic to bring back this issue. Reported-by: Matt Evans Reported-by: Indan Zupancic Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7c1b765ecc59..5671752f8d9c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -475,8 +475,10 @@ void bpf_jit_compile(struct sk_filter *fp) case BPF_S_LD_W_ABS: func = sk_load_word; common_load: seen |= SEEN_DATAREF; - if ((int)K < 0) + if ((int)K < 0) { + /* Abort the JIT because __load_pointer() is needed. */ goto out; + } t_offset = func - (image + addrs[i]); EMIT1_off32(0xbe, K); /* mov imm32,%esi */ EMIT1_off32(0xe8, t_offset); /* call */ @@ -489,14 +491,8 @@ common_load: seen |= SEEN_DATAREF; goto common_load; case BPF_S_LDX_B_MSH: if ((int)K < 0) { - if (pc_ret0 > 0) { - /* addrs[pc_ret0 - 1] is the start address */ - EMIT_JMP(addrs[pc_ret0 - 1] - addrs[i]); - break; - } - CLEAR_A(); - EMIT_JMP(cleanup_addr - addrs[i]); - break; + /* Abort the JIT because __load_pointer() is needed. 
*/ + goto out; } seen |= SEEN_DATAREF | SEEN_XREG; t_offset = sk_load_byte_msh - (image + addrs[i]); -- cgit v1.2.3 From b74f05d61b73af584d0c39121980171389ecfaaa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 13 Feb 2012 11:07:27 -0200 Subject: x86: kvmclock: abstract save/restore sched_clock_state Upon resume from hibernation, CPU 0's hvclock area contains the old values for system_time and tsc_timestamp. It is necessary for the hypervisor to update these values with uptodate ones before the CPU uses them. Abstract TSC's save/restore sched_clock_state functions and use restore_state to write to KVM_SYSTEM_TIME MSR, forcing an update. Also move restore_sched_clock_state before __restore_processor_state, since the later calls CONFIG_LOCK_STAT's lockstat_clock (also for TSC). Thanks to Igor Mammedov for tracking it down. Fixes suspend-to-disk with kvmclock. Reviewed-by: Thomas Gleixner Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/tsc.h | 4 ++-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/kvmclock.c | 11 +++++++++++ arch/x86/kernel/tsc.c | 4 ++-- arch/x86/kernel/x86_init.c | 4 +++- arch/x86/power/cpu.c | 4 ++-- 6 files changed, 24 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 15d99153a96d..c91e8b9d588b 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); extern int notsc_setup(char *); -extern void save_sched_clock_state(void); -extern void restore_sched_clock_state(void); +extern void tsc_save_sched_clock_state(void); +extern void tsc_restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5d0afac2962c..baaca8defec8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -162,6 +162,8 @@ 
struct x86_cpuinit_ops { * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus * @i8042_detect pre-detect if i8042 controller exists + * @save_sched_clock_state: save state for sched_clock() on suspend + * @restore_sched_clock_state: restore state for sched_clock() on resume */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -173,6 +175,8 @@ struct x86_platform_ops { void (*nmi_init)(void); unsigned char (*get_nmi_reason)(void); int (*i8042_detect)(void); + void (*save_sched_clock_state)(void); + void (*restore_sched_clock_state)(void); }; struct pci_dev; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca4e735adc54..f8492da65bfc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) return ret; } +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + #ifdef CONFIG_X86_LOCAL_APIC static void __cpuinit kvm_setup_secondary_clock(void) { @@ -195,6 +204,8 @@ void __init kvmclock_init(void) x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97ec..aed2aa1088f1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -629,7 +629,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void) { if (!sched_clock_stable) return; @@ -645,7 +645,7 @@ void save_sched_clock_state(void) * that sched_clock() continues from the point where it was left off during * suspend. 
*/ -void restore_sched_clock_state(void) +void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 6f2ec53deed0..e9f265fd79ae 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -108,7 +108,9 @@ struct x86_platform_ops x86_platform = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect + .i8042_detect = default_i8042_detect, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, }; EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index f10c0afa1cb4..0e76a2814127 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -114,7 +114,7 @@ static void __save_processor_state(struct saved_context *ctxt) void save_processor_state(void) { __save_processor_state(&saved_context); - save_sched_clock_state(); + x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -230,8 +230,8 @@ static void __restore_processor_state(struct saved_context *ctxt) /* Needed by apm.c */ void restore_processor_state(void) { + x86_platform.restore_sched_clock_state(); __restore_processor_state(&saved_context); - restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); -- cgit v1.2.3 From 02626b6af5d2bc62db3bb85fc2891b2725535d44 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 8 Mar 2012 18:46:57 -0300 Subject: KVM: x86: fix kvm_write_tsc() TSC matching thinko kvm_write_tsc() converts from guest TSC to microseconds, not nanoseconds as intended. The result is that the window for matching is 1000 seconds, not 1 second. Microsecond precision is enough for checking whether the TSC write delta is within the heuristic values, so use it instead of nanoseconds. 
Noted by Avi Kivity. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32096cf6c6c9..7287812eeb72 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1025,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 nsdiff; + s64 usdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1033,18 +1033,19 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) elapsed = ns - kvm->arch.last_tsc_nsec; /* n.b - signed multiplication and division required */ - nsdiff = data - kvm->arch.last_tsc_write; + usdiff = data - kvm->arch.last_tsc_write; #ifdef CONFIG_X86_64 - nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz; + usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; #else /* do_div() only does unsigned */ asm("idivl %2; xor %%edx, %%edx" - : "=A"(nsdiff) - : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); + : "=A"(usdiff) + : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); #endif - nsdiff -= elapsed; - if (nsdiff < 0) - nsdiff = -nsdiff; + do_div(elapsed, 1000); + usdiff -= elapsed; + if (usdiff < 0) + usdiff = -usdiff; /* * Special case: TSC write with a small delta (1 second) of virtual @@ -1056,7 +1057,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. 
*/ - if (nsdiff < NSEC_PER_SEC && + if (usdiff < USEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; -- cgit v1.2.3 From 8fd75e1216e0ba601a746177e6c102d5593b572f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:17 +0800 Subject: x86: remove the second argument of k[un]map_atomic() Acked-by: Avi Kivity Acked-by: Herbert Xu Signed-off-by: Cong Wang --- arch/x86/crypto/aesni-intel_glue.c | 24 ++++++++++++------------ arch/x86/kernel/crash_dump_32.c | 6 +++--- arch/x86/kvm/lapic.c | 8 ++++---- arch/x86/kvm/paging_tmpl.h | 4 ++-- arch/x86/kvm/x86.c | 8 ++++---- arch/x86/lib/usercopy_32.c | 4 ++-- 6 files changed, 27 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 545d0ce59818..152232d2dc6a 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1107,12 +1107,12 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk, 0); - assoc = scatterwalk_map(&assoc_sg_walk, 0); + src = scatterwalk_map(&src_sg_walk); + assoc = scatterwalk_map(&assoc_sg_walk); dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk, 0); + dst = scatterwalk_map(&dst_sg_walk); } } else { @@ -1136,11 +1136,11 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) * back to the packet. 
*/ if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst, 0); + scatterwalk_unmap(dst); scatterwalk_done(&dst_sg_walk, 0, 0); } - scatterwalk_unmap(src, 0); - scatterwalk_unmap(assoc, 0); + scatterwalk_unmap(src); + scatterwalk_unmap(assoc); scatterwalk_done(&src_sg_walk, 0, 0); scatterwalk_done(&assoc_sg_walk, 0, 0); } else { @@ -1189,12 +1189,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk, 0); - assoc = scatterwalk_map(&assoc_sg_walk, 0); + src = scatterwalk_map(&src_sg_walk); + assoc = scatterwalk_map(&assoc_sg_walk); dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk, 0); + dst = scatterwalk_map(&dst_sg_walk); } } else { @@ -1219,11 +1219,11 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst, 0); + scatterwalk_unmap(dst); scatterwalk_done(&dst_sg_walk, 0, 0); } - scatterwalk_unmap(src, 0); - scatterwalk_unmap(assoc, 0); + scatterwalk_unmap(src); + scatterwalk_unmap(assoc); scatterwalk_done(&src_sg_walk, 0, 0); scatterwalk_done(&assoc_sg_walk, 0, 0); } else { diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 642f75a68cd5..11891ca7b716 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!userbuf) { memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); } else { if (!kdump_buf_page) { printk(KERN_WARNING "Kdump: Kdump buffer page not" " allocated\n"); - kunmap_atomic(vaddr, KM_PTE0); + kunmap_atomic(vaddr); return -EFAULT; } copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr, KM_PTE0); + 
kunmap_atomic(vaddr); if (copy_to_user(buf, (kdump_buf_page + offset), csize)) return -EFAULT; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cfdc6e0ef002..31bfc6927bc0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1283,9 +1283,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) return; - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + vapic = kmap_atomic(vcpu->arch.apic->vapic_page); data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic, KM_USER0); + kunmap_atomic(vapic); apic_set_tpr(vcpu->arch.apic, data & 0xff); } @@ -1310,9 +1310,9 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + vapic = kmap_atomic(vcpu->arch.apic->vapic_page); *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic, KM_USER0); + kunmap_atomic(vapic); } void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 15610285ebb6..df5a70311be8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -92,9 +92,9 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (unlikely(npages != 1)) return -EFAULT; - table = kmap_atomic(page, KM_USER0); + table = kmap_atomic(page); ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); + kunmap_atomic(table); kvm_release_page_dirty(page); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc0698118..bb4fd2636bc2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1162,12 +1162,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + shared_kaddr = 
kmap_atomic(vcpu->time_page); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); - kunmap_atomic(shared_kaddr, KM_USER0); + kunmap_atomic(shared_kaddr); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); return 0; @@ -3848,7 +3848,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, goto emul_write; } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); kaddr += offset_in_page(gpa); switch (bytes) { case 1: @@ -3866,7 +3866,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, default: BUG(); } - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); kvm_release_page_dirty(page); if (!exchanged) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index e218d5df85ff..d9b094ca7aaa 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -760,9 +760,9 @@ survive: break; } - maddr = kmap_atomic(pg, KM_USER0); + maddr = kmap_atomic(pg); memcpy(maddr + offset, from, len); - kunmap_atomic(maddr, KM_USER0); + kunmap_atomic(maddr); set_page_dirty_lock(pg); put_page(pg); up_read(&current->mm->mmap_sem); -- cgit v1.2.3 From a24401bcf4a67c8fe17e649e74eeb09b08b79ef5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 26 Nov 2011 10:53:39 +0800 Subject: highmem: kill all __kmap_atomic() [swarren@nvidia.com: highmem: Fix ARM build break due to __kmap_atomic rename] Signed-off-by: Stephen Warren Signed-off-by: Cong Wang --- arch/arm/include/asm/highmem.h | 2 +- arch/arm/mm/highmem.c | 4 ++-- arch/frv/include/asm/highmem.h | 2 +- arch/frv/mm/highmem.c | 4 ++-- arch/mips/include/asm/highmem.h | 2 +- arch/mips/mm/highmem.c | 4 ++-- arch/mn10300/include/asm/highmem.h | 2 +- arch/parisc/include/asm/cacheflush.h | 2 +- arch/powerpc/include/asm/highmem.h | 2 +- arch/sparc/include/asm/highmem.h | 2 +- arch/sparc/mm/highmem.c | 4 ++-- arch/tile/include/asm/highmem.h | 2 +- arch/tile/mm/highmem.c | 4 ++-- arch/x86/include/asm/highmem.h | 2 +- arch/x86/mm/highmem_32.c | 4 ++-- 
include/linux/highmem.h | 11 +++-------- 16 files changed, 24 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index a4edd19dd3d6..8c5e828f484d 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -57,7 +57,7 @@ static inline void *kmap_high_get(struct page *page) #ifdef CONFIG_HIGHMEM extern void *kmap(struct page *page); extern void kunmap(struct page *page); -extern void *__kmap_atomic(struct page *page); +extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(const void *ptr); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 807c0573abbe..5a21505d7550 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -36,7 +36,7 @@ void kunmap(struct page *page) } EXPORT_SYMBOL(kunmap); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { unsigned int idx; unsigned long vaddr; @@ -81,7 +81,7 @@ void *__kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index a8d6565d415d..716956a5317b 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -157,7 +157,7 @@ static inline void kunmap_atomic_primary(void *kvaddr, enum km_type type) pagefault_enable(); } -void *__kmap_atomic(struct page *page); +void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); #endif /* !__ASSEMBLY__ */ diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c index fd7fcd4c2e33..31902c9d5be5 100644 --- a/arch/frv/mm/highmem.c +++ b/arch/frv/mm/highmem.c @@ -37,7 +37,7 @@ struct page *kmap_atomic_to_page(void *ptr) return virt_to_page(ptr); } -void *__kmap_atomic(struct page *page) +void 
*kmap_atomic(struct page *page) { unsigned long paddr; int type; @@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page) return NULL; } } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 77e644082a3b..2d91888c9b74 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -47,7 +47,7 @@ extern void kunmap_high(struct page *page); extern void *kmap(struct page *page); extern void kunmap(struct page *page); -extern void *__kmap_atomic(struct page *page); +extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 3634c7ea06ac..aff57057a949 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -41,7 +41,7 @@ EXPORT_SYMBOL(kunmap); * kmaps are appropriate for short, tight code paths only. */ -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { unsigned long vaddr; int idx, type; @@ -62,7 +62,7 @@ void *__kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h index bfe2d88604d9..7c137cd8aa37 100644 --- a/arch/mn10300/include/asm/highmem.h +++ b/arch/mn10300/include/asm/highmem.h @@ -70,7 +70,7 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -static inline unsigned long __kmap_atomic(struct page *page) +static inline unsigned long kmap_atomic(struct page *page) { unsigned long vaddr; int idx, type; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index da601dd34c05..9f21ab0c02e3 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -140,7 +140,7 @@ static inline void *kmap(struct page *page) #define kunmap(page) kunmap_parisc(page_address(page)) -static inline void *__kmap_atomic(struct page *page) +static inline void *kmap_atomic(struct page *page) { pagefault_disable(); return page_address(page); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index dbc264010d0b..caaf6e00630d 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -79,7 +79,7 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -static inline void *__kmap_atomic(struct page *page) +static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 3d7afbb7f4bb..3b6e00dd96e5 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -70,7 +70,7 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -extern void *__kmap_atomic(struct page *page); +extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern struct page *kmap_atomic_to_page(void *vaddr); diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 77140a02c86a..055c66cf1bf4 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -30,7 +30,7 @@ #include #include -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { unsigned long vaddr; long idx, type; @@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(__kmap_atomic); 
+EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h index b2a6c5de79ab..fc8429a31c85 100644 --- a/arch/tile/include/asm/highmem.h +++ b/arch/tile/include/asm/highmem.h @@ -59,7 +59,7 @@ void *kmap_fix_kpte(struct page *page, int finished); /* This macro is used only in map_new_virtual() to map "page". */ #define kmap_prot page_to_kpgprot(page) -void *__kmap_atomic(struct page *page); +void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index 31dbbd9afe47..ef8e5a62b6e3 100644 --- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -224,12 +224,12 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { /* PAGE_NONE is a magic value that tells us to check immutability. 
*/ return kmap_atomic_prot(page, PAGE_NONE); } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 3bd04022fd0c..302a323b3f67 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -61,7 +61,7 @@ void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, pgprot_t prot); -void *__kmap_atomic(struct page *page); +void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index f4f29b19fac5..6f31ee56c008 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -51,11 +51,11 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 284ec5535f3d..6549ed75e0a7 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -55,12 +55,12 @@ static inline void kunmap(struct page *page) { } -static inline void *__kmap_atomic(struct page *page) +static inline void *kmap_atomic(struct page *page) { pagefault_disable(); return page_address(page); } -#define kmap_atomic_prot(page, prot) __kmap_atomic(page) +#define kmap_atomic_prot(page, prot) kmap_atomic(page) static inline void __kunmap_atomic(void *addr) { @@ -121,15 +121,10 @@ static inline void kmap_atomic_idx_pop(void) #define NARG_(_2, _1, n, ...) n #define NARG(...) 
NARG_(__VA_ARGS__, 2, 1, :) -static inline void *kmap_atomic(struct page *page) -{ - return __kmap_atomic(page); -} - static inline void __deprecated *kmap_atomic_deprecated(struct page *page, enum km_type km) { - return __kmap_atomic(page); + return kmap_atomic(page); } #define kmap_atomic1(...) kmap_atomic(__VA_ARGS__) -- cgit v1.2.3 From 984165a37ca65d990419566d9af5dd247d03d2a0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 15 Dec 2011 22:28:37 +0000 Subject: x86, mrst: add msic_thermal platform support This will let the MSIC driver create a platform device for the thermal driver. Signed-off-by: Mika Westerberg Signed-off-by: Kirill A. Shutemov Signed-off-by: Alan Cox Signed-off-by: Matthew Garrett --- arch/x86/platform/mrst/mrst.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 475e2cd0f3c3..229b8bf42cd9 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -686,6 +686,11 @@ static void *msic_ocd_platform_data(void *info) return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); } +static void *msic_thermal_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL); +} + static const struct devs_id __initconst device_ids[] = { {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, @@ -705,6 +710,7 @@ static const struct devs_id __initconst device_ids[] = { {"msic_audio", SFI_DEV_TYPE_IPC, 1, &msic_audio_platform_data}, {"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data}, {"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data}, + {"msic_thermal", SFI_DEV_TYPE_IPC, 1, &msic_thermal_platform_data}, {}, }; -- cgit v1.2.3 From 3197059af0762c191af23c0ce3fd6f8311c564e7 Mon Sep 17 00:00:00 2001 From: "Philip A. 
Prindeville" Date: Sat, 14 Jan 2012 01:45:39 -0700 Subject: geos: Platform driver for Geos and Geos2 single-board computers. Trivial platform driver for Traverse Technologies Geos and Geos2 single-board computers. Uses SMBIOS to identify platform. Based on progressive revisions of the leds-net5501 driver that was rewritten by Ed Wildgoose as a platform driver. Supports GPIO-based LEDs (3) and 1 polled button which is typically used for a soft reset. Signed-off-by: Philip Prindeville Reviewed-by: Ed Wildgoose Acked-by: Andres Salomon Cc: Richard Purdie Cc: Andrew Morton Signed-off-by: Matthew Garrett --- arch/x86/Kconfig | 7 +++ arch/x86/platform/geode/Makefile | 1 + arch/x86/platform/geode/geos.c | 128 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 arch/x86/platform/geode/geos.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..3a38c4c1d359 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2133,6 +2133,13 @@ config ALIX Note: You have to set alix.force=1 for boards with Award BIOS. +config GEOS + bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)" + select GPIOLIB + depends on DMI + ---help--- + This option enables system support for the Traverse Technologies GEOS. + endif # X86_32 config AMD_NB diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile index 07c9cd05021a..d8ba5644f2f6 100644 --- a/arch/x86/platform/geode/Makefile +++ b/arch/x86/platform/geode/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_ALIX) += alix.o +obj-$(CONFIG_GEOS) += geos.o diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c new file mode 100644 index 000000000000..c2e6d53558be --- /dev/null +++ b/arch/x86/platform/geode/geos.c @@ -0,0 +1,128 @@ +/* + * System Specific setup for Traverse Technologies GEOS. + * At the moment this means setup of GPIO control of LEDs. 
+ * + * Copyright (C) 2008 Constantin Baranov + * Copyright (C) 2011 Ed Wildgoose + * and Philip Prindeville + * + * TODO: There are large similarities with leds-net5501.c + * by Alessandro Zummo + * In the future leds-net5501.c should be migrated over to platform + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct gpio_keys_button geos_gpio_buttons[] = { + { + .code = KEY_RESTART, + .gpio = 3, + .active_low = 1, + .desc = "Reset button", + .type = EV_KEY, + .wakeup = 0, + .debounce_interval = 100, + .can_disable = 0, + } +}; +static struct gpio_keys_platform_data geos_buttons_data = { + .buttons = geos_gpio_buttons, + .nbuttons = ARRAY_SIZE(geos_gpio_buttons), + .poll_interval = 20, +}; + +static struct platform_device geos_buttons_dev = { + .name = "gpio-keys-polled", + .id = 1, + .dev = { + .platform_data = &geos_buttons_data, + } +}; + +static struct gpio_led geos_leds[] = { + { + .name = "geos:1", + .gpio = 6, + .default_trigger = "default-on", + .active_low = 1, + }, + { + .name = "geos:2", + .gpio = 25, + .default_trigger = "default-off", + .active_low = 1, + }, + { + .name = "geos:3", + .gpio = 27, + .default_trigger = "default-off", + .active_low = 1, + }, +}; + +static struct gpio_led_platform_data geos_leds_data = { + .num_leds = ARRAY_SIZE(geos_leds), + .leds = geos_leds, +}; + +static struct platform_device geos_leds_dev = { + .name = "leds-gpio", + .id = -1, + .dev.platform_data = &geos_leds_data, +}; + +static struct __initdata platform_device *geos_devs[] = { + &geos_buttons_dev, + &geos_leds_dev, +}; + +static void __init register_geos(void) +{ + /* Setup LED control through leds-gpio driver */ + platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs)); +} + +static 
int __init geos_init(void) +{ + const char *vendor, *product; + + if (!is_geode()) + return 0; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + if (!vendor || strcmp(vendor, "Traverse Technologies")) + return 0; + + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product || strcmp(product, "Geos")) + return 0; + + printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n", + KBUILD_MODNAME, vendor, product); + + register_geos(); + + return 0; +} + +module_init(geos_init); + +MODULE_AUTHOR("Philip Prindeville "); +MODULE_DESCRIPTION("Traverse Technologies Geos System Setup"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 8fc3dc5a3a17aa2b353886422bd89420619af211 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 17 Mar 2012 03:05:16 -0400 Subject: __register_binfmt() made void Just don't pass NULL to it - nobody does, anyway. Signed-off-by: Al Viro --- arch/alpha/kernel/binfmt_loader.c | 3 ++- arch/x86/ia32/ia32_aout.c | 3 ++- fs/binfmt_aout.c | 3 ++- fs/binfmt_elf.c | 3 ++- fs/binfmt_elf_fdpic.c | 3 ++- fs/binfmt_em86.c | 3 ++- fs/binfmt_flat.c | 3 ++- fs/binfmt_misc.c | 7 ++----- fs/binfmt_script.c | 3 ++- fs/binfmt_som.c | 3 ++- fs/exec.c | 6 ++---- include/linux/binfmts.h | 10 +++++----- 12 files changed, 27 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c index 3fcfad410130..d1f474d1d44d 100644 --- a/arch/alpha/kernel/binfmt_loader.c +++ b/arch/alpha/kernel/binfmt_loader.c @@ -46,6 +46,7 @@ static struct linux_binfmt loader_format = { static int __init init_loader_binfmt(void) { - return insert_binfmt(&loader_format); + insert_binfmt(&loader_format); + return 0; } arch_initcall(init_loader_binfmt); diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 39e49091f648..cdfc8dc43670 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -519,7 +519,8 @@ out: static int __init init_aout_binfmt(void) { - return 
register_binfmt(&aout_format); + register_binfmt(&aout_format); + return 0; } static void __exit exit_aout_binfmt(void) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 1ff94054d35a..a543364ba29b 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -454,7 +454,8 @@ out: static int __init init_aout_binfmt(void) { - return register_binfmt(&aout_format); + register_binfmt(&aout_format); + return 0; } static void __exit exit_aout_binfmt(void) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 07d096c49920..f8ac4251877e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2077,7 +2077,8 @@ out: static int __init init_elf_binfmt(void) { - return register_binfmt(&elf_format); + register_binfmt(&elf_format); + return 0; } static void __exit exit_elf_binfmt(void) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 30745f459faf..e7afcb67a2d3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -91,7 +91,8 @@ static struct linux_binfmt elf_fdpic_format = { static int __init init_elf_fdpic_binfmt(void) { - return register_binfmt(&elf_fdpic_format); + register_binfmt(&elf_fdpic_format); + return 0; } static void __exit exit_elf_fdpic_binfmt(void) diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index b8e8b0acf9bd..2790c7e1912e 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -100,7 +100,8 @@ static struct linux_binfmt em86_format = { static int __init init_em86_binfmt(void) { - return register_binfmt(&em86_format); + register_binfmt(&em86_format); + return 0; } static void __exit exit_em86_binfmt(void) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 1bffbe0ed778..68affab88146 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -950,7 +950,8 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) static int __init init_flat_binfmt(void) { - return register_binfmt(&flat_format); + register_binfmt(&flat_format); + return 0; } 
/****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a9198dfd5f85..1ffb60355cae 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -726,11 +726,8 @@ static struct file_system_type bm_fs_type = { static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); - if (!err) { - err = insert_binfmt(&misc_format); - if (err) - unregister_filesystem(&bm_fs_type); - } + if (!err) + insert_binfmt(&misc_format); return err; } diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 396a9884591f..d3b8c1f63155 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -105,7 +105,8 @@ static struct linux_binfmt script_format = { static int __init init_script_binfmt(void) { - return register_binfmt(&script_format); + register_binfmt(&script_format); + return 0; } static void __exit exit_script_binfmt(void) diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index cc8560f6c9b0..ec15972dd98a 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -289,7 +289,8 @@ static int load_som_library(struct file *f) static int __init init_som_binfmt(void) { - return register_binfmt(&som_format); + register_binfmt(&som_format); + return 0; } static void __exit exit_som_binfmt(void) diff --git a/fs/exec.c b/fs/exec.c index 153dee14fe55..2c5ae338773c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -79,15 +79,13 @@ static atomic_t call_count = ATOMIC_INIT(1); static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); -int __register_binfmt(struct linux_binfmt * fmt, int insert) +void __register_binfmt(struct linux_binfmt * fmt, int insert) { - if (!fmt) - return -EINVAL; + BUG_ON(!fmt); write_lock(&binfmt_lock); insert ? 
list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); - return 0; } EXPORT_SYMBOL(__register_binfmt); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 0092102db2de..366422bc1633 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -92,17 +92,17 @@ struct linux_binfmt { unsigned long min_coredump; /* minimal dump size */ }; -extern int __register_binfmt(struct linux_binfmt *fmt, int insert); +extern void __register_binfmt(struct linux_binfmt *fmt, int insert); /* Registration of default binfmt handlers */ -static inline int register_binfmt(struct linux_binfmt *fmt) +static inline void register_binfmt(struct linux_binfmt *fmt) { - return __register_binfmt(fmt, 0); + __register_binfmt(fmt, 0); } /* Same as above, but adds a new binfmt at the top of the list */ -static inline int insert_binfmt(struct linux_binfmt *fmt) +static inline void insert_binfmt(struct linux_binfmt *fmt) { - return __register_binfmt(fmt, 1); + __register_binfmt(fmt, 1); } extern void unregister_binfmt(struct linux_binfmt *); -- cgit v1.2.3 From 19e5109fef2c368ab3f8a5157270f87f4a7c0326 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Feb 2012 22:29:17 -0500 Subject: take removal of PF_FORKNOEXEC to flush_old_exec() Signed-off-by: Al Viro --- arch/x86/ia32/ia32_aout.c | 1 - fs/binfmt_aout.c | 1 - fs/binfmt_elf.c | 2 -- fs/binfmt_elf_fdpic.c | 3 --- fs/binfmt_flat.c | 1 - fs/binfmt_som.c | 1 - fs/exec.c | 2 +- 7 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index cdfc8dc43670..4c2e59a420b9 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -323,7 +323,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) } install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c 
index a543364ba29b..4d5e6d26578c 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -267,7 +267,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) } install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8ac4251877e..81878b78c9d4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -712,7 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_dentry; /* OK, This is the point of no return */ - current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; /* Do this immediately, since STACK_TOP as used in setup_arg_pages @@ -934,7 +933,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); if (retval < 0) { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e7afcb67a2d3..c64bf5ee2df4 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -335,8 +335,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, current->mm->context.exec_fdpic_loadmap = 0; current->mm->context.interp_fdpic_loadmap = 0; - current->flags &= ~PF_FORKNOEXEC; - #ifdef CONFIG_MMU elf_fdpic_arch_lay_out_mm(&exec_params, &interp_params, @@ -414,7 +412,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, #endif install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) goto error_kill; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 68affab88146..04f61f0bdfde 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -902,7 +902,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) 
libinfo.lib_list[j].start_data:UNLOADED_LIB; install_exec_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; set_binfmt(&flat_format); diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index ec15972dd98a..e4fc746629a7 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -225,7 +225,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free; /* OK, This is the point of no return */ - current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; setup_new_exec(bprm); diff --git a/fs/exec.c b/fs/exec.c index 2c5ae338773c..60478a0e7a37 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1110,7 +1110,7 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); - current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); + current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD); flush_thread(); current->personality &= ~bprm->per_clear; -- cgit v1.2.3 From 1a5a9906d4e8d1976b701f889d8f35d54b928f25 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 21 Mar 2012 16:33:42 -0700 Subject: mm: thp: fix pmd_bad() triggering in code paths holding mmap_sem read mode In some cases it may happen that pmd_none_or_clear_bad() is called with the mmap_sem hold in read mode. In those cases the huge page faults can allocate hugepmds under pmd_none_or_clear_bad() and that can trigger a false positive from pmd_bad() that will not like to see a pmd materializing as trans huge. It's not khugepaged causing the problem, khugepaged holds the mmap_sem in write mode (and all those sites must hold the mmap_sem in read mode to prevent pagetables to go away from under them, during code review it seems vm86 mode on 32bit kernels requires that too unless it's restricted to 1 thread per process or UP builds). The race is only with the huge pagefaults that can convert a pmd_none() into a pmd_trans_huge(). 
Effectively all these pmd_none_or_clear_bad() sites running with mmap_sem in read mode are somewhat speculative with the page faults, and the result is always undefined when they run simultaneously. This is probably why it wasn't common to run into this. For example, if madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page fault, the hugepage will not be zapped; if the page fault runs first, it will be zapped. Altering pmd_bad() not to error out if it finds hugepmds won't be enough to fix this, because zap_pmd_range would then proceed to call zap_pte_range (which would be incorrect if the pmd becomes a pmd_trans_huge()). The simplest way to fix this is to read the pmd into a local variable on the stack (regardless of what we read; no actual CPU barriers are needed, only a compiler barrier), and be sure it is not changing under the code that computes its value. Even if the real pmd is changing under the value we hold on the stack, we don't care. If we actually end up in zap_pte_range it means the pmd was not none already and it was not huge, and it can't become huge from under us (khugepaged locking explained above). All we need is to enforce that there is no way anymore that in a code path like below, pmd_trans_huge can be false, but pmd_none_or_clear_bad can run into a hugepmd. The overhead of a barrier() is just a compiler tweak and should not be measurable (I only added it for THP builds). I don't exclude that different compiler versions may have prevented the race too by caching the value of *pmd on the stack (that hasn't been verified, but it wouldn't be impossible considering pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none are all inlines and there's no external function called in between pmd_trans_huge and pmd_none_or_clear_bad).
if (pmd_trans_huge(*pmd)) { if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) continue; /* fall through */ } if (pmd_none_or_clear_bad(pmd)) Because this race condition could be exercised without special privileges this was reported in CVE-2012-1179. The race was identified and fully explained by Ulrich who debugged it. I'm quoting his accurate explanation below, for reference. ====== start quote ======= mapcount 0 page_mapcount 1 kernel BUG at mm/huge_memory.c:1384! At some point prior to the panic, a "bad pmd ..." message similar to the following is logged on the console: mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7). The "bad pmd ..." message is logged by pmd_clear_bad() before it clears the page's PMD table entry. 143 void pmd_clear_bad(pmd_t *pmd) 144 { -> 145 pmd_ERROR(*pmd); 146 pmd_clear(pmd); 147 } After the PMD table entry has been cleared, there is an inconsistency between the actual number of PMD table entries that are mapping the page and the page's map count (_mapcount field in struct page). When the page is subsequently reclaimed, __split_huge_page() detects this inconsistency. 1381 if (mapcount != page_mapcount(page)) 1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1383 mapcount, page_mapcount(page)); -> 1384 BUG_ON(mapcount != page_mapcount(page)); The root cause of the problem is a race of two threads in a multithreaded process. Thread B incurs a page fault on a virtual address that has never been accessed (PMD entry is zero) while Thread A is executing an madvise() system call on a virtual address within the same 2 MB (huge page) range. virtual address space .---------------------. | | | | .-|---------------------| | | | | | |<-- B(fault) | | | 2 MB | |/////////////////////|-. 
huge < |/////////////////////| > A(range) page | |/////////////////////|-' | | | | | | '-|---------------------| | | | | '---------------------' - Thread A is executing an madvise(..., MADV_DONTNEED) system call on the virtual address range "A(range)" shown in the picture. sys_madvise // Acquire the semaphore in shared mode. down_read(&current->mm->mmap_sem) ... madvise_vma switch (behavior) case MADV_DONTNEED: madvise_dontneed zap_page_range unmap_vmas unmap_page_range zap_pud_range zap_pmd_range // // Assume that this huge page has never been accessed. // I.e. content of the PMD entry is zero (not mapped). // if (pmd_trans_huge(*pmd)) { // We don't get here due to the above assumption. } // // Assume that Thread B incurred a page fault and .---------> // sneaks in here as shown below. | // | if (pmd_none_or_clear_bad(pmd)) | { | if (unlikely(pmd_bad(*pmd))) | pmd_clear_bad | { | pmd_ERROR | // Log "bad pmd ..." message here. | pmd_clear | // Clear the page's PMD entry. | // Thread B incremented the map count | // in page_add_new_anon_rmap(), but | // now the page is no longer mapped | // by a PMD entry (-> inconsistency). | } | } | v - Thread B is handling a page fault on virtual address "B(fault)" shown in the picture. ... do_page_fault __do_page_fault // Acquire the semaphore in shared mode. down_read_trylock(&mm->mmap_sem) ... handle_mm_fault if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) // We get here due to the above assumption (PMD entry is zero). do_huge_pmd_anonymous_page alloc_hugepage_vma // Allocate a new transparent huge page here. ... __do_huge_pmd_anonymous_page ... spin_lock(&mm->page_table_lock) ... page_add_new_anon_rmap // Here we increment the page's map count (starts at -1). atomic_set(&page->_mapcount, 0) set_pmd_at // Here we set the page's PMD entry which will be cleared // when Thread A calls pmd_clear_bad(). ...
spin_unlock(&mm->page_table_lock) The mmap_sem does not prevent the race because both threads are acquiring it in shared mode (down_read). Thread B holds the page_table_lock while the page's map count and PMD table entry are updated. However, Thread A does not synchronize on that lock. ====== end quote ======= [akpm@linux-foundation.org: checkpatch fixes] Reported-by: Ulrich Obergfell Signed-off-by: Andrea Arcangeli Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Hugh Dickins Cc: Dave Jones Acked-by: Larry Woodman Acked-by: Rik van Riel Cc: [2.6.38+] Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 2 ++ fs/proc/task_mmu.c | 9 +++++++ include/asm-generic/pgtable.h | 61 +++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 4 +++ mm/memory.c | 16 +++++++++--- mm/mempolicy.c | 2 +- mm/mincore.c | 2 +- mm/pagewalk.c | 2 +- mm/swapfile.c | 4 +-- 9 files changed, 92 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index b466cab5ba15..328cb37bb827 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) spinlock_t *ptl; int i; + down_write(&mm->mmap_sem); pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; @@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) } pte_unmap_unlock(pte, ptl); out: + up_write(&mm->mmap_sem); flush_tlb(); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7dcd2a250495..3efa7253523e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { spin_unlock(&walk->mm->page_table_lock); } + + if (pmd_trans_unstable(pmd)) + return 0; /* * The mmap_sem held all the way back in m_start() is what * keeps khugepaged out of here and from collapsing things @@ -507,6 +510,8 @@ static int 
clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(&walk->mm->page_table_lock); } + if (pmd_trans_unstable(pmd)) + return 0; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, md->vma, addr); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 76bff2bff15e..a03c098b0cce 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); #endif +#ifdef CONFIG_MMU + #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { @@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd) return 0; } #endif /* __HAVE_ARCH_PMD_WRITE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * This function is meant to be used by sites walking pagetables with + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and + * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd + * into a null pmd and the transhuge page fault can convert a null pmd + * into an hugepmd or into a regular pmd (if the hugepage allocation + * fails). While holding the mmap_sem in read mode the pmd becomes + * stable and stops changing under us only if it's not null and not a + * transhuge pmd. 
When those races occurs and this function makes a + * difference vs the standard pmd_none_or_clear_bad, the result is + * undefined so behaving like if the pmd was none is safe (because it + * can return none anyway). The compiler level barrier() is critically + * important to compute the two checks atomically on the same pmdval. + */ +static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) +{ + /* depend on compiler for an atomic pmd read */ + pmd_t pmdval = *pmd; + /* + * The barrier will stabilize the pmdval in a register or on + * the stack so that it will stop changing under the code. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + barrier(); +#endif + if (pmd_none(pmdval)) + return 1; + if (unlikely(pmd_bad(pmdval))) { + if (!pmd_trans_huge(pmdval)) + pmd_clear_bad(pmd); + return 1; + } + return 0; +} + +/* + * This is a noop if Transparent Hugepage Support is not built into + * the kernel. Otherwise it is equivalent to + * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in + * places that already verified the pmd is not none and they want to + * walk ptes while holding the mmap sem in read mode (write mode don't + * need this). If THP is not enabled, the pmd can't go away under the + * code even if MADV_DONTNEED runs, but if THP is enabled we need to + * run a pmd_trans_unstable before walking the ptes after + * split_huge_page_pmd returns (because it may have run when the pmd + * become null, but then a page fault can map in a THP and not a + * regular page). 
+ */ +static inline int pmd_trans_unstable(pmd_t *pmd) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return pmd_none_or_trans_huge_or_clear_bad(pmd); +#else + return 0; #endif +} + +#endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26c6f4ec20f4..37281816ff67 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5230,6 +5230,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, spinlock_t *ptl; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) @@ -5390,6 +5392,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { diff --git a/mm/memory.c b/mm/memory.c index 347e5fad1cfa..e01abb908b6b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1247,16 +1247,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) { + if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) - continue; + goto next; /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) - continue; + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_sem in read + * mode. 
+ */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: cond_resched(); } while (pmd++, addr = next, addr != end); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 47296fee23db..0a3757067631 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, do { next = pmd_addr_end(addr, end); split_huge_page_pmd(vma->vm_mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, flags, private)) diff --git a/mm/mincore.c b/mm/mincore.c index 636a86876ff2..936b4cee8cb1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, } /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else mincore_pte_range(vma, pmd, addr, next, vec); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2f5cf10ff660..aa9701e12714 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -59,7 +59,7 @@ again: continue; split_huge_page_pmd(walk->mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); if (err) diff --git a/mm/swapfile.c b/mm/swapfile.c index 00a962caab1a..44595a373e42 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (unlikely(pmd_trans_huge(*pmd))) - continue; - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); if (ret) -- cgit v1.2.3 From cbde83e21c4fd50bfc4240408355c1e5d393063d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong 
Date: Wed, 21 Mar 2012 16:33:55 -0700 Subject: hugetlb: try to search again if it is really needed Search again only if some holes may be skipped in the first pass. [akpm@linux-foundation.org: clean up crazy compound definition] Signed-off-by: Xiao Guangrong Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Michal Hocko Cc: Hillf Danton Cc: Andrea Arcangeli Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8ecbb4bba4b3..c20e81c3425d 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -309,9 +309,10 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev_vma; - unsigned long base = mm->mmap_base, addr = addr0; + unsigned long base = mm->mmap_base; + unsigned long addr = addr0; unsigned long largest_hole = mm->cached_hole_size; - int first_time = 1; + unsigned long start_addr; /* don't allow allocations above current base */ if (mm->free_area_cache > base) @@ -322,6 +323,8 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, mm->free_area_cache = base; } try_again: + start_addr = mm->free_area_cache; + /* make sure it can fit in the remaining address space */ if (mm->free_area_cache < len) goto fail; @@ -368,10 +371,9 @@ fail: * if hint left us with no space for the requested * mapping then try again: */ - if (first_time) { + if (start_addr != base) { mm->free_area_cache = base; largest_hole = 0; - first_time = 0; goto try_again; } /* -- cgit v1.2.3 From b716ad953a2bc4a543143c1d9836b7007a4b182f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 21 Mar 2012 16:33:56 -0700 Subject: mm: search from free_area_cache for the bigger size If the required size is bigger 
than cached_hole_size it is better to search from free_area_cache - it is easier to get a free region, specifically for the 64 bit process whose address space is large enough Do it just as hugetlb_get_unmapped_area_topdown() in arch/x86/mm/hugetlbpage.c Signed-off-by: Xiao Guangrong Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Michal Hocko Cc: Hillf Danton Cc: Andrea Arcangeli Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/sys_x86_64.c | 34 +++++++++++++++++----------------- mm/mmap.c | 36 +++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 051489082d59..ef59642ff1bf 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; + unsigned long addr = addr0, start_addr; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, mm->free_area_cache = mm->mmap_base; } +try_again: /* either no address requested or can't fit in requested address hole */ - addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (addr > len) { - unsigned long tmp_addr = align_addr(addr - len, filp, - ALIGN_TOPDOWN); - - vma = find_vma(mm, tmp_addr); - if (!vma || tmp_addr + len <= vma->vm_start) - /* remember the address as a hint for next time */ - return mm->free_area_cache = tmp_addr; - } - - if (mm->mmap_base < len) - goto bottomup; + start_addr = addr = mm->free_area_cache; - addr = mm->mmap_base-len; + if (addr < len) + goto fail; + addr -= len; do { addr = align_addr(addr, filp, ALIGN_TOPDOWN); @@ 
-263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = vma->vm_start-len; } while (len < vma->vm_start); +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + */ + if (start_addr != mm->mmap_base) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + goto try_again; + } + bottomup: /* * A failed mmap() very likely causes application failure, diff --git a/mm/mmap.c b/mm/mmap.c index 4f31764d838f..9e0c0de2e7e3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1442,7 +1442,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; + unsigned long addr = addr0, start_addr; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -1466,22 +1466,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, mm->free_area_cache = mm->mmap_base; } +try_again: /* either no address requested or can't fit in requested address hole */ - addr = mm->free_area_cache; + start_addr = addr = mm->free_area_cache; - /* make sure it can fit in the remaining address space */ - if (addr > len) { - vma = find_vma(mm, addr-len); - if (!vma || addr <= vma->vm_start) - /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr-len); - } - - if (mm->mmap_base < len) - goto bottomup; - - addr = mm->mmap_base-len; + if (addr < len) + goto fail; + addr -= len; do { /* * Lookup failure means no vma is above this address, @@ -1501,7 +1493,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = vma->vm_start-len; } while (len < vma->vm_start); -bottomup: +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + * + * Note: this is different with the case of bottomup + * which does the fully line-search, but we use find_vma + * here that causes 
some holes skipped. + */ + if (start_addr != mm->mmap_base) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + goto try_again; + } + /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario -- cgit v1.2.3 From b69add218d32450d6604bc9080f6e33e19b06f5e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 21 Mar 2012 16:34:14 -0700 Subject: hugetlb: remove prev_vma from hugetlb_get_unmapped_area_topdown() After looking up the vma which covers or follows the cached search address, the following condition is always true: !prev_vma || (addr >= prev_vma->vm_end) so we can stop checking the previous VMA altogether. Signed-off-by: Xiao Guangrong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index c20e81c3425d..f6679a7fb8ca 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -308,7 +308,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, { struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev_vma; + struct vm_area_struct *vma; unsigned long base = mm->mmap_base; unsigned long addr = addr0; unsigned long largest_hole = mm->cached_hole_size; @@ -340,22 +340,14 @@ try_again: if (!vma) return addr; - /* - * new region fits between prev_vma->vm_end and - * vma->vm_start, use it: - */ - prev_vma = vma->vm_prev; - if (addr + len <= vma->vm_start && - (!prev_vma || (addr >= prev_vma->vm_end))) { + if (addr + len <= vma->vm_start) { /* remember the address as a hint for next time */ mm->cached_hole_size = largest_hole; return (mm->free_area_cache = addr); - } else { + } else if (mm->free_area_cache == vma->vm_end) { /* pull free_area_cache down to the first hole */ - if (mm->free_area_cache 
== vma->vm_end) { - mm->free_area_cache = vma->vm_start; - mm->cached_hole_size = largest_hole; - } + mm->free_area_cache = vma->vm_start; + mm->cached_hole_size = largest_hole; } /* remember the largest hole we saw so far */ -- cgit v1.2.3 From d71b5a73fe9af42752c4329b087f7911b35f8f79 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 21 Mar 2012 16:34:16 -0700 Subject: numa_emulation: fix cpumask_of_node() Without this fix the cpumask_of_node() for a fake=numa=2 is: cpumask 0 ff cpumask 1 ff with the fix it's correct and it's set to: cpumask 0 55 cpumask 1 aa Signed-off-by: Andrea Arcangeli Cc: Andi Kleen Cc: Johannes Weiner Cc: David Rientjes Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa_emulation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 46db56845f18..740b0a355431 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = pb->nid; + emu_nid_to_phys[nid] = nid; pb->start += size; if (pb->start >= pb->end) { -- cgit v1.2.3 From 676a38046f4fba4e7418756c6f6fc25cf5976312 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 15 Mar 2012 22:11:51 +0200 Subject: crypto: camellia-x86_64 - module init/exit functions should be static This caused conflict with twofish-x86_64-3way when compiled into kernel, same function names and not static. 
Reported-by: Randy Dunlap Signed-off-by: Jussi Kivilinna Acked-by: Randy Dunlap Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia_glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 1ca36a93fd2f..3306dc0b139e 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1925,7 +1925,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -int __init init(void) +static int __init init(void) { if (!force && is_blacklisted_cpu()) { printk(KERN_INFO @@ -1938,7 +1938,7 @@ int __init init(void) return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); } -void __exit fini(void) +static void __exit fini(void) { crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); } -- cgit v1.2.3 From ff0a70fe053614e763eb3ac88bfea9c5615fce3b Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 15 Mar 2012 22:11:57 +0200 Subject: crypto: twofish-x86_64-3way - module init/exit functions should be static This caused conflict with camellia-x86_64 when compiled into kernel, same function names and not static. 
Reported-by: Randy Dunlap Signed-off-by: Jussi Kivilinna Acked-by: Randy Dunlap Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 408fc0c5814e..922ab24cce31 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -668,7 +668,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -int __init init(void) +static int __init init(void) { if (!force && is_blacklisted_cpu()) { printk(KERN_INFO @@ -681,7 +681,7 @@ int __init init(void) return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); } -void __exit fini(void) +static void __exit fini(void) { crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); } -- cgit v1.2.3 From 13354dc412c36fe554f9904a92f1268c74af7e87 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 21 Mar 2012 22:50:08 +0100 Subject: x86-32: Fix typo for mq_getsetattr in syscall table Syscall 282 was mistakenly named mq_getsetaddr instead of mq_getsetattr. When building uClibc against the Linux kernel this would result in a shared library that doesn't provide the mq_getattr() and mq_setattr() functions. Signed-off-by: Thierry Reding Link: http://lkml.kernel.org/r/1332366608-2695-2-git-send-email-thierry.reding@avionic-design.de Cc: v3.3 Signed-off-by: H. 
Peter Anvin --- arch/x86/syscalls/syscall_32.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index ce98e287c066..e7e67cc3c14b 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -288,7 +288,7 @@ 279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend 280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive 281 i386 mq_notify sys_mq_notify compat_sys_mq_notify -282 i386 mq_getsetaddr sys_mq_getsetattr compat_sys_mq_getsetattr +282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 283 i386 kexec_load sys_kexec_load compat_sys_kexec_load 284 i386 waitid sys_waitid compat_sys_waitid # 285 sys_setaltroot -- cgit v1.2.3 From 446e1c86d51d0823e003a43a2b85c430efce2733 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 22 Mar 2012 11:08:18 -0700 Subject: x86, boot: Correct CFLAGS for hostprogs This is a partial revert of commit: d40f833 "Restrict CFLAGS for hostprogs" The endian-manipulation macros in tools/include need <linux/types.h>, but the hostprogs in arch/x86/boot need several headers from the kernel build tree, which means we have to add the kernel headers to the include path. This picks up <linux/types.h> from the kernel tree, which gives a warning. Since this use of <linux/types.h> is intentional, add -D__EXPORTED_HEADERS__ to the command line to silence the warning. A better way to fix this would be to always install the exported kernel headers into $(objtree)/usr/include as a standard part of the kernel build, but that is a lot more involved. Reported-by: Linus Torvalds Acked-by: Matt Fleming Link: http://lkml.kernel.org/r/1330436245-24875-5-git-send-email-matt@console-pimps.org Signed-off-by: H.
Peter Anvin --- arch/x86/boot/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 3e02148bb774..5a747dd884db 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -37,9 +37,9 @@ setup-y += video-bios.o targets += $(setup-y) hostprogs-y := mkcpustr tools/build -HOSTCFLAGS_mkcpustr.o := -I$(srctree)/arch/$(SRCARCH)/include -HOST_EXTRACFLAGS += -I$(objtree)/include -I$(srctree)/tools/include \ - -include $(srctree)/include/linux/kconfig.h +HOST_EXTRACFLAGS += -I$(srctree)/tools/include $(LINUXINCLUDE) \ + -D__EXPORTED_HEADERS__ + $(obj)/cpu.o: $(obj)/cpustr.h quiet_cmd_cpustr = CPUSTR $@ -- cgit v1.2.3 From 639077fb69aec8112e5427210a83d0fb192969f0 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 19 Mar 2012 15:16:48 -0500 Subject: kgdb: x86: Return all segment registers also in 64-bit mode Even if the content is always 0, gdb expects us to return also ds, es, fs, and gs while in x86-64 mode. Do this to avoid ugly errors on "info registers". [jason.wessel@windriver.com: adjust NUMREGBYTES for two new regs] Signed-off-by: Jan Kiszka Signed-off-by: Jason Wessel --- arch/x86/include/asm/kgdb.h | 10 +++++++--- arch/x86/kernel/kgdb.c | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 77e95f54570a..332f98c9111f 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -64,11 +64,15 @@ enum regnames { GDB_PS, /* 17 */ GDB_CS, /* 18 */ GDB_SS, /* 19 */ + GDB_DS, /* 20 */ + GDB_ES, /* 21 */ + GDB_FS, /* 22 */ + GDB_GS, /* 23 */ }; #define GDB_ORIG_AX 57 -#define DBG_MAX_REG_NUM 20 -/* 17 64 bit regs and 3 32 bit regs */ -#define NUMREGBYTES ((17 * 8) + (3 * 4)) +#define DBG_MAX_REG_NUM 24 +/* 17 64 bit regs and 5 32 bit regs */ +#define NUMREGBYTES ((17 * 8) + (5 * 4)) #endif /* ! 
CONFIG_X86_32 */ static inline void arch_kgdb_breakpoint(void) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771acad..fdc37b3d0ce3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "ss", 4, offsetof(struct pt_regs, ss) }, { "ds", 4, offsetof(struct pt_regs, ds) }, { "es", 4, offsetof(struct pt_regs, es) }, - { "fs", 4, -1 }, - { "gs", 4, -1 }, #else { "ax", 8, offsetof(struct pt_regs, ax) }, { "bx", 8, offsetof(struct pt_regs, bx) }, @@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "flags", 4, offsetof(struct pt_regs, flags) }, { "cs", 4, offsetof(struct pt_regs, cs) }, { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, -1 }, + { "es", 4, -1 }, #endif + { "fs", 4, -1 }, + { "gs", 4, -1 }, }; int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) -- cgit v1.2.3 From 29a2e2836ff9ea65a603c89df217f4198973a74f Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Thu, 22 Mar 2012 21:39:25 +0100 Subject: x86-32: Fix endless loop when processing signals for kernel tasks The problem occurs on !CONFIG_VM86 kernels [1] when a kernel-mode task returns from a system call with a pending signal. A real-life scenario is a child of 'khelper' returning from a failed kernel_execve() in ____call_usermodehelper() [ kernel/kmod.c ]. kernel_execve() fails due to a pending SIGKILL, which is the result of "kill -9 -1" (at least, busybox's init does it upon reboot). The loop is as follows: * syscall_exit_work: - work_pending: // start_of_the_loop - work_notify_sig: - do_notify_resume() - do_signal() - if (!user_mode(regs)) return; - resume_userspace // TIF_SIGPENDING is still set - work_pending // so we call work_pending => goto // start_of_the_loop More information can be found in another LKML thread: http://www.serverphorums.com/read.php?12,457826 [1] the problem was also seen on MIPS. 
Signed-off-by: Dmitry Adamushko Link: http://lkml.kernel.org/r/1332448765.2299.68.camel@dimm Cc: Oleg Nesterov Cc: Roland McGrath Cc: Andrew Morton Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 79d97e68f042..7b784f4ef1e4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -98,12 +98,6 @@ #endif .endm -#ifdef CONFIG_VM86 -#define resume_userspace_sig check_userspace -#else -#define resume_userspace_sig resume_userspace -#endif - /* * User gs save/restore * @@ -327,10 +321,19 @@ ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) -check_userspace: +resume_userspace_sig: +#ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from a syscall done in the kernel space, + * e.g. a failed kernel_execve(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace -- cgit v1.2.3 From c7206205d00ab375839bd6c7ddb247d600693c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Mar 2012 17:26:36 +0100 Subject: perf: Fix mmap_page capabilities and docs Complete the syscall-less self-profiling feature and address all complaints, namely: - capabilities, so we can detect what is actually available at runtime Add a capabilities field to perf_event_mmap_page to indicate what is actually available for use. - on x86: RDPMC weirdness due to being 40/48 bits and not sign-extending properly. - ABI documentation as to how all this stuff works. Also improve the documentation for the new features. 
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1332433596.2487.33.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 ++++- include/linux/perf_event.h | 83 +++++++++++++++++++++++++++++++++++----- kernel/events/core.c | 4 +- 3 files changed, 84 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 453ac9497574..4ef8104958ee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1622,6 +1622,9 @@ static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; + if (!x86_pmu.attr_rdpmc) + return 0; + if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { idx -= X86_PMC_IDX_FIXED; idx |= 1 << 30; @@ -1706,14 +1709,19 @@ static struct pmu pmu = { .flush_branch_stack = x86_pmu_flush_branch_stack, }; -void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + userpg->cap_usr_time = 0; + userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->pmc_width = x86_pmu.cntval_bits; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; + userpg->cap_usr_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 57ae485e80fc..ca9ed4e6a286 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -299,18 +299,31 @@ struct perf_event_mmap_page { /* * Bits needed to read the hw events in user-space. 
* - * u32 seq; - * s64 count; + * u32 seq, time_mult, time_shift, idx, width; + * u64 count, enabled, running; + * u64 cyc, time_offset; + * s64 pmc = 0; * * do { * seq = pc->lock; - * * barrier() - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; + * + * enabled = pc->time_enabled; + * running = pc->time_running; + * + * if (pc->cap_usr_time && enabled != running) { + * cyc = rdtsc(); + * time_offset = pc->time_offset; + * time_mult = pc->time_mult; + * time_shift = pc->time_shift; + * } + * + * idx = pc->index; + * count = pc->offset; + * if (pc->cap_usr_rdpmc && idx) { + * width = pc->pmc_width; + * pmc = rdpmc(idx - 1); + * } * * barrier(); * } while (pc->lock != seq); @@ -323,14 +336,57 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ - __u32 time_mult, time_shift; + union { + __u64 capabilities; + __u64 cap_usr_time : 1, + cap_usr_rdpmc : 1, + cap_____res : 62; + }; + + /* + * If cap_usr_rdpmc this field provides the bit-width of the value + * read using the rdpmc() or equivalent instruction. This can be used + * to sign extend the result like: + * + * pmc <<= 64 - width; + * pmc >>= 64 - width; // signed shift right + * count += pmc; + */ + __u16 pmc_width; + + /* + * If cap_usr_time the below fields can be used to compute the time + * delta since time_enabled (in ns) using rdtsc or similar. + * + * u64 quot, rem; + * u64 delta; + * + * quot = (cyc >> time_shift); + * rem = cyc & ((1 << time_shift) - 1); + * delta = time_offset + quot * time_mult + + * ((rem * time_mult) >> time_shift); + * + * Where time_offset,time_mult,time_shift and cyc are read in the + * seqcount loop described above. 
This delta can then be added to + * enabled and possible running (if idx), improving the scaling: + * + * enabled += delta; + * if (idx) + * running += delta; + * + * quot = count / running; + * rem = count % running; + * count = quot * enabled + (rem * enabled) / running; + */ + __u16 time_shift; + __u32 time_mult; __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[121]; /* align to 1k */ + __u64 __reserved[120]; /* align to 1k */ /* * Control data for the mmap() data buffer. @@ -347,6 +403,13 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ }; +/* + * Build time assertion that we keep the data_head at the intended location. + * IOW, validation we got the __reserved[] size right. + */ +extern char __assert_mmap_data_head_offset + [1 - 2*!!(offsetof(struct perf_event_mmap_page, data_head) != 1024)]; + #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index c61234b1a988..dc3b05272511 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } -void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - perf_update_user_clock(userpg, now); + arch_perf_update_userpage(userpg, now); barrier(); ++userpg->lock; -- cgit v1.2.3 From 0b8b8078cb4db2ebe9a20f2b2eaeb58988be32bd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 Mar 2012 21:31:43 -0700 Subject: x86: Fix excessive MSR print out when show_msr is not specified Dave found: | During bootup, I now have 
162 messages like this.. | [ 0.227346] MSR0000001b: 00000000fee00900 | [ 0.227465] MSR00000021: 0000000000000001 | [ 0.227584] MSR0000002a: 00000000c1c81400 | | commit 21c3fcf3e39353d4f21d50e257cc74f3204b1988 looks suspect. | It claims that it will only print these out if show_msr= is | passed, but that doesn't seem to be the case. Fix it by changing to the version that checks the index. Reported-and-tested-by: Dave Jones Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1332477103-4595-1-git-send-email-yinghai@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ade9c794ed98..b24032355a76 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -998,7 +998,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "\n"); - __print_cpu_msr(); + print_cpu_msr(c); } void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) -- cgit v1.2.3 From 280fb016bfb098f33df96016cfaa840db77ba2d0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 23 Mar 2012 13:12:38 +0100 Subject: x86/kconfig: Update defconfigs Link: http://lkml.kernel.org/n/tip-ahz3d8i1vxwj0379gv4tqcru@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 65 +++++++++++++------------------------ arch/x86/configs/x86_64_defconfig | 68 ++++++++++++++------------------------- 2 files changed, 47 insertions(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 2bf18059fbea..2d562821a88f 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -15,23 +15,28 @@ CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y CONFIG_CGROUP_SCHED=y -CONFIG_UTS_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y 
-CONFIG_KALLSYMS_EXTRA_PASS=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_KPROBES=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y -CONFIG_SPARSE_IRQ=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y CONFIG_SCHED_SMT=y @@ -51,14 +56,12 @@ CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_COMPAT_VDSO is not set -CONFIG_PM=y +CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y -CONFIG_HIBERNATION=y CONFIG_ACPI_PROCFS=y CONFIG_ACPI_DOCK=y CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_DEBUG=y # CONFIG_CPU_FREQ_STAT is not set CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_PERFORMANCE=y @@ -69,7 +72,6 @@ CONFIG_PCI_MSI=y CONFIG_PCCARD=y CONFIG_YENTA=y CONFIG_HOTPLUG_PCI=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_BINFMT_MISC=y CONFIG_NET=y CONFIG_PACKET=y @@ -120,7 +122,6 @@ CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_LOG=y CONFIG_IP_NF_TARGET_ULOG=y CONFIG_NF_NAT=y CONFIG_IP_NF_TARGET_MASQUERADE=y @@ -128,7 +129,6 @@ CONFIG_IP_NF_MANGLE=y CONFIG_NF_CONNTRACK_IPV6=y CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MATCH_IPV6HEADER=y -CONFIG_IP6_NF_TARGET_LOG=y CONFIG_IP6_NF_FILTER=y CONFIG_IP6_NF_TARGET_REJECT=y CONFIG_IP6_NF_MANGLE=y @@ -169,25 +169,21 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -CONFIG_NET_ETHERNET=y -CONFIG_NET_VENDOR_3COM=y +CONFIG_NETCONSOLE=y +CONFIG_BNX2=y +CONFIG_TIGON3=y CONFIG_NET_TULIP=y -CONFIG_NET_PCI=y -CONFIG_FORCEDETH=y CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_SKY2=y CONFIG_NE2K_PCI=y 
+CONFIG_FORCEDETH=y CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set -CONFIG_E1000=y -CONFIG_E1000E=y CONFIG_R8169=y -CONFIG_SKY2=y -CONFIG_TIGON3=y -CONFIG_BNX2=y -CONFIG_TR=y -CONFIG_NET_PCMCIA=y CONFIG_FDDI=y -CONFIG_NETCONSOLE=y +CONFIG_TR=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y @@ -196,6 +192,7 @@ CONFIG_INPUT_TABLET=y CONFIG_INPUT_TOUCHSCREEN=y CONFIG_INPUT_MISC=y CONFIG_VT_HW_CONSOLE_BINDING=y +# CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_NONSTANDARD=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y @@ -205,7 +202,6 @@ CONFIG_SERIAL_8250_MANY_PORTS=y CONFIG_SERIAL_8250_SHARE_IRQ=y CONFIG_SERIAL_8250_DETECT_IRQ=y CONFIG_SERIAL_8250_RSA=y -# CONFIG_LEGACY_PTYS is not set CONFIG_HW_RANDOM=y CONFIG_NVRAM=y CONFIG_HPET=y @@ -220,7 +216,6 @@ CONFIG_DRM_I915=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y @@ -283,7 +278,6 @@ CONFIG_ZISOFS=y CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_NFS_FS=y @@ -291,18 +285,6 @@ CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y @@ -317,13 +299,12 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set +CONFIG_DEBUG_STACK_USAGE=y CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y 
-CONFIG_DEBUG_STACK_USAGE=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m CONFIG_DEBUG_BOOT_PARAMS=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 058a35b8286c..3cf137ad2789 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,4 +1,3 @@ -CONFIG_64BIT=y CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y @@ -16,26 +15,29 @@ CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y CONFIG_CGROUP_SCHED=y -CONFIG_UTS_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y -CONFIG_KALLSYMS_EXTRA_PASS=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_KPROBES=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_SGI_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y -CONFIG_SPARSE_IRQ=y CONFIG_CALGARY_IOMMU=y -CONFIG_AMD_IOMMU=y -CONFIG_AMD_IOMMU_STATS=y CONFIG_NR_CPUS=64 CONFIG_SCHED_SMT=y CONFIG_PREEMPT_VOLUNTARY=y @@ -53,27 +55,22 @@ CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_COMPAT_VDSO is not set -CONFIG_PM=y +CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y -CONFIG_HIBERNATION=y CONFIG_ACPI_PROCFS=y CONFIG_ACPI_DOCK=y CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_DEBUG=y # CONFIG_CPU_FREQ_STAT is not set CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_X86_ACPI_CPUFREQ=y CONFIG_PCI_MMCONFIG=y -CONFIG_INTEL_IOMMU=y -# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set CONFIG_PCIEPORTBUS=y CONFIG_PCCARD=y CONFIG_YENTA=y CONFIG_HOTPLUG_PCI=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y 
CONFIG_BINFMT_MISC=y CONFIG_IA32_EMULATION=y CONFIG_NET=y @@ -125,7 +122,6 @@ CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_LOG=y CONFIG_IP_NF_TARGET_ULOG=y CONFIG_NF_NAT=y CONFIG_IP_NF_TARGET_MASQUERADE=y @@ -133,7 +129,6 @@ CONFIG_IP_NF_MANGLE=y CONFIG_NF_CONNTRACK_IPV6=y CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MATCH_IPV6HEADER=y -CONFIG_IP6_NF_TARGET_LOG=y CONFIG_IP6_NF_FILTER=y CONFIG_IP6_NF_TARGET_REJECT=y CONFIG_IP6_NF_MANGLE=y @@ -172,20 +167,16 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -CONFIG_NET_ETHERNET=y -CONFIG_NET_VENDOR_3COM=y +CONFIG_NETCONSOLE=y +CONFIG_TIGON3=y CONFIG_NET_TULIP=y -CONFIG_NET_PCI=y -CONFIG_FORCEDETH=y CONFIG_E100=y -CONFIG_8139TOO=y CONFIG_E1000=y CONFIG_SKY2=y -CONFIG_TIGON3=y -CONFIG_TR=y -CONFIG_NET_PCMCIA=y +CONFIG_FORCEDETH=y +CONFIG_8139TOO=y CONFIG_FDDI=y -CONFIG_NETCONSOLE=y +CONFIG_TR=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y @@ -194,6 +185,7 @@ CONFIG_INPUT_TABLET=y CONFIG_INPUT_TOUCHSCREEN=y CONFIG_INPUT_MISC=y CONFIG_VT_HW_CONSOLE_BINDING=y +# CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_NONSTANDARD=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y @@ -203,7 +195,6 @@ CONFIG_SERIAL_8250_MANY_PORTS=y CONFIG_SERIAL_8250_SHARE_IRQ=y CONFIG_SERIAL_8250_DETECT_IRQ=y CONFIG_SERIAL_8250_RSA=y -# CONFIG_LEGACY_PTYS is not set CONFIG_HW_RANDOM=y # CONFIG_HW_RANDOM_INTEL is not set # CONFIG_HW_RANDOM_AMD is not set @@ -221,7 +212,6 @@ CONFIG_DRM_I915_KMS=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y @@ -268,6 +258,10 @@ CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set CONFIG_DMADEVICES=y CONFIG_EEEPC_LAPTOP=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_STATS=y +CONFIG_INTEL_IOMMU=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set 
CONFIG_EFI_VARS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set @@ -284,7 +278,6 @@ CONFIG_ZISOFS=y CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_NFS_FS=y @@ -292,18 +285,6 @@ CONFIG_NFS_V3=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y @@ -317,13 +298,12 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set +CONFIG_DEBUG_STACK_USAGE=y CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y -CONFIG_DEBUG_STACK_USAGE=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m CONFIG_DEBUG_BOOT_PARAMS=y -- cgit v1.2.3 From b7157acf429e6aef690646ba964b9ebd25049ec2 Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Fri, 16 Mar 2012 20:25:35 +0100 Subject: x86/apic: Add separate apic_id_valid() functions for selected apic drivers As suggested by Suresh Siddha and Yinghai Lu: For x2apic pre-enabled systems, apic driver is set already early through early_acpi_boot_init()/early_acpi_process_madt()/ acpi_parse_madt()/default_acpi_madt_oem_check() path so that apic_id_valid() checking will be sufficient during MADT and SRAT parsing. For non-x2apic pre-enabled systems, all apic ids should be less than 255. This allows us to substitute the checks in arch/x86/kernel/acpi/boot.c::acpi_parse_x2apic() and arch/x86/mm/srat.c::acpi_numa_x2apic_affinity_init() with apic->apic_id_valid(). 
In addition we can avoid feigning the x2apic cpu feature in the NumaChip apic code. The following apic drivers have separate apic_id_valid() functions which will accept x2apic type IDs : x2apic_phys x2apic_cluster x2apic_uv_x apic_numachip Signed-off-by: Steffen Persvold Cc: Suresh Siddha Cc: Daniel J Blueman Cc: Yinghai Lu Cc: Jack Steiner Link: http://lkml.kernel.org/r/1331925935-13372-1-git-send-email-sp@numascale.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/x2apic.h | 5 +++++ arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 7 ++++++- arch/x86/mm/srat.c | 2 +- 8 files changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9371c91718c..d3eaac44860a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -535,7 +535,7 @@ static inline unsigned int read_apic_id(void) static inline int default_apic_id_valid(int apicid) { - return x2apic_mode || (apicid < 255); + return (apicid < 255); } extern void default_setup_apic_routing(void); diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 6bf5b8e478c0..92e54abf89e0 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -18,6 +18,11 @@ static const struct cpumask *x2apic_target_cpus(void) return cpu_online_mask; } +static int x2apic_apic_id_valid(int apicid) +{ + return 1; +} + static int x2apic_apic_id_registered(void) { return 1; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 406ed77216d0..0f42c2f44311 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -239,7 +239,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we 
use CPU hotplug. */ - if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) + if (!apic->apic_id_valid(apic_id) && enabled) printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); else acpi_register_lapic(apic_id, enabled); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index d9ea5f331ac5..899803e03214 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -229,11 +229,10 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; - setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 9193713060a9..48f3103b3c93 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,7 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bcd1db6eaca9..8a778db45e3a 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,7 +119,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c 
b/arch/x86/kernel/apic/x2apic_uv_x.c index fc4771425852..87bfa69e216e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -266,6 +266,11 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } +static int uv_apic_id_valid(int apicid) +{ + return 1; +} + static int uv_apic_id_registered(void) { return 1; @@ -351,7 +356,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 1c1c4f46a7c1..efb5b4b93711 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -70,7 +70,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; pxm = pa->proximity_domain; apic_id = pa->apic_id; - if (!cpu_has_x2apic && (apic_id >= 0xff)) { + if (!apic->apic_id_valid(apic_id)) { printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id); return; -- cgit v1.2.3 From 4da7072ad6831a35a11341097ce477e18651bedd Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 20 Mar 2012 15:19:36 +0100 Subject: x86/io_apic: Move and reenable irq only when CONFIG_GENERIC_PENDING_IRQ=y This patch removes dead code from certain .config variations. When CONFIG_GENERIC_PENDING_IRQ=n irq move and reenable code is never get executed, nor do_unmask_irq variable updates its init value. Move the code under CONFIG_GENERIC_PENDING_IRQ macro. 
Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120320141935.GA24806@dhcp-26-207.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 101 +++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d10a66fc5a9..2c428c5d7ca3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2512,21 +2512,73 @@ static void ack_apic_edge(struct irq_data *data) atomic_t irq_mis_count; -static void ack_apic_level(struct irq_data *data) -{ - struct irq_cfg *cfg = data->chip_data; - int i, do_unmask_irq = 0, irq = data->irq; - unsigned long v; - - irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - do_unmask_irq = 1; mask_ioapic(cfg); + return true; } + return false; +} + +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ + if (unlikely(masked)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. 
+ * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(cfg)) + irq_move_masked_irq(data); + unmask_ioapic(cfg); + } +} +#else +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ + return false; +} +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ +} #endif +static void ack_apic_level(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + int i, irq = data->irq; + unsigned long v; + bool masked; + + irq_complete_move(cfg); + masked = ioapic_irqd_mask(data, cfg); + /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -2581,38 +2633,7 @@ static void ack_apic_level(struct irq_data *data) eoi_ioapic_irq(irq, cfg); } - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. 
- * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(cfg)) - irq_move_masked_irq(data); - unmask_ioapic(cfg); - } + ioapic_irqd_unmask(data, cfg, masked); } #ifdef CONFIG_IRQ_REMAP -- cgit v1.2.3 From 91ec87d57fc38c529034e853687dfb7756de5406 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 22 Mar 2012 21:15:51 -0700 Subject: x86-64: Simplify and optimize vdso clock_gettime monotonic variants We used to store the wall-to-monotonic offset and the realtime base. It's faster to precompute the monotonic base. This is about a 3% speedup on Sandy Bridge for CLOCK_MONOTONIC. It's much more impressive for CLOCK_MONOTONIC_COARSE. 
Signed-off-by: Andy Lutomirski Signed-off-by: John Stultz --- arch/x86/include/asm/vgtod.h | 15 +++++++++------ arch/x86/kernel/vsyscall_64.c | 10 +++++++++- arch/x86/vdso/vclock_gettime.c | 38 ++++++++------------------------------ 3 files changed, 26 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 1f007178c813..8b38be2de9e1 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -7,11 +7,6 @@ struct vsyscall_gtod_data { seqcount_t seq; - /* open coded 'struct timespec' */ - time_t wall_time_sec; - u32 wall_time_nsec; - - struct timezone sys_tz; struct { /* extract of a clocksource struct */ int vclock_mode; cycle_t cycle_last; @@ -19,8 +14,16 @@ struct vsyscall_gtod_data { u32 mult; u32 shift; } clock; - struct timespec wall_to_monotonic; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + u32 monotonic_time_nsec; + time_t monotonic_time_sec; + + struct timezone sys_tz; struct timespec wall_time_coarse; + struct timespec monotonic_time_coarse; }; extern struct vsyscall_gtod_data vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index cdc95a707cd1..4285f1f404c2 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -84,6 +84,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { write_seqcount_begin(&vsyscall_gtod_data.seq); + struct timespec monotonic; /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -91,10 +92,17 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - 
vsyscall_gtod_data.wall_to_monotonic = *wtm; + + monotonic = timespec_add(*wall_time, *wtm); + vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec; + vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.monotonic_time_coarse = + timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm); write_seqcount_end(&vsyscall_gtod_data.seq); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 944c5e5d6b6a..6eea70b8f384 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -113,27 +113,17 @@ notrace static noinline int do_realtime(struct timespec *ts) notrace static noinline int do_monotonic(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq, ns; int mode; do { seq = read_seqcount_begin(&gtod->seq); mode = gtod->clock.vclock_mode; - secs = gtod->wall_time_sec; - ns = gtod->wall_time_nsec + vgetns(); - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; + ts->tv_sec = gtod->monotonic_time_sec; + ts->tv_nsec = gtod->monotonic_time_nsec; + ns = vgetns(); } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); - - /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec - * are all guaranteed to be nonnegative. 
- */ - while (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; + timespec_add_ns(ts, ns); return mode; } @@ -151,25 +141,13 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) notrace static noinline int do_monotonic_coarse(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq; do { seq = read_seqcount_begin(&gtod->seq); - secs = gtod->wall_time_coarse.tv_sec; - ns = gtod->wall_time_coarse.tv_nsec; - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; + ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; + ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); - /* wall_time_nsec and wall_to_monotonic.tv_nsec are - * guaranteed to be between 0 and NSEC_PER_SEC. - */ - if (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; - return 0; } -- cgit v1.2.3 From 5f293474c4c6c4dc2baaf2dfd486748b5986de76 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 22 Mar 2012 21:15:52 -0700 Subject: x86-64: Inline vdso clock_gettime helpers This is about a 3% speedup on Sandy Bridge. Signed-off-by: Andy Lutomirski Signed-off-by: John Stultz --- arch/x86/vdso/vclock_gettime.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6eea70b8f384..885eff49d6ab 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -94,7 +94,8 @@ notrace static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -notrace static noinline int do_realtime(struct timespec *ts) +/* Code size doesn't matter (vdso is 4k anyway) and this is faster. 
*/ +notrace static int __always_inline do_realtime(struct timespec *ts) { unsigned long seq, ns; int mode; @@ -111,7 +112,7 @@ notrace static noinline int do_realtime(struct timespec *ts) return mode; } -notrace static noinline int do_monotonic(struct timespec *ts) +notrace static int do_monotonic(struct timespec *ts) { unsigned long seq, ns; int mode; @@ -128,7 +129,7 @@ notrace static noinline int do_monotonic(struct timespec *ts) return mode; } -notrace static noinline int do_realtime_coarse(struct timespec *ts) +notrace static int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { @@ -139,7 +140,7 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) return 0; } -notrace static noinline int do_monotonic_coarse(struct timespec *ts) +notrace static int do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { -- cgit v1.2.3 From 307b1cd7ecd7f3dc5ce3d3860957f034f0abe4df Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 23 Mar 2012 15:02:03 -0700 Subject: bitops: rename for_each_set_bit_cont() in favor of analogous list.h function This renames for_each_set_bit_cont() to for_each_set_bit_from() because it is analogous to list_for_each_entry_from() in list.h rather than list_for_each_entry_continue(). This doesn't remove for_each_set_bit_cont() for now. Signed-off-by: Akinobu Mita Cc: Robert Richter Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- include/linux/bitops.h | 5 ++++- tools/perf/util/include/linux/bitops.h | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0a18d16cb58d..fa2900c0e398 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) /* Prefer fixed purpose counters */ if (x86_pmu.num_counters_fixed) { idx = X86_PMC_IDX_FIXED; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 94300fe46cce..a78e358f0c17 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -27,11 +27,14 @@ extern unsigned long __sw_hweight64(__u64 w); (bit) = find_next_bit((addr), (size), (bit) + 1)) /* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ +#define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) +#define for_each_set_bit_cont(bit, addr, size) \ + for_each_set_bit_from(bit, addr, size) + static __inline__ int get_bitmask_order(unsigned int count) { int order; diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index 62cdee78db7b..f1584833bd22 100644 --- 
a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -15,7 +15,7 @@ (bit) = find_next_bit((addr), (size), (bit) + 1)) /* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ +#define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) -- cgit v1.2.3 From 0b2f4d4d76a09f02fa37bfa57909483448fac771 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 23 Mar 2012 15:02:06 -0700 Subject: x86: use for_each_clear_bit_from() Use for_each_clear_bit() to iterate over all the cleared bit in a memory region. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/irqinit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbce..43e2b1cff0a7 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -306,10 +306,10 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + i = FIRST_EXTERNAL_VECTOR; + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. 
*/ - if (!test_bit(i, used_vectors)) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } if (!acpi_ioapic && !of_ioapic) -- cgit v1.2.3 From 909af768e88867016f427264ae39d27a57b6a8ed Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 23 Mar 2012 15:02:51 -0700 Subject: coredump: remove VM_ALWAYSDUMP flag The motivation for this patchset was that I was looking at a way for a qemu-kvm process, to exclude the guest memory from its core dump, which can be quite large. There are already a number of filter flags in /proc/<pid>/coredump_filter, however, these allow one to specify 'types' of kernel memory, not specific address ranges (which is needed in this case). Since there are no more vma flags available, the first patch eliminates the need for the 'VM_ALWAYSDUMP' flag. The flag is used internally by the kernel to mark vdso and vsyscall pages. However, it is simple enough to check if a vma covers a vdso or vsyscall page without the need for this flag. The second patch then replaces the 'VM_ALWAYSDUMP' flag with a new 'VM_NODUMP' flag, which can be set by userspace using new madvise flags: 'MADV_DONTDUMP', and unset via 'MADV_DODUMP'. The core dump filters continue to work the same as before unless 'MADV_DONTDUMP' is set on the region. The qemu code which implements this feature is at: http://people.redhat.com/~jbaron/qemu-dump/qemu-dump.patch In my testing the qemu core dump shrunk from 383MB -> 13MB with this patch. I also believe that the 'MADV_DONTDUMP' flag might be useful for security sensitive apps, which might want to select which areas are dumped. This patch: The VM_ALWAYSDUMP flag is currently used by the coredump code to indicate that a vma is part of a vsyscall or vdso section. However, we can determine if a vma is in one of these sections by checking it against the gate_vma and checking for a non-NULL return value from arch_vma_name(). Thus, freeing a valuable vma bit. 
Signed-off-by: Jason Baron Acked-by: Roland McGrath Cc: Chris Metcalf Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 3 +-- arch/hexagon/kernel/vdso.c | 3 +-- arch/mips/kernel/vdso.c | 3 +-- arch/powerpc/kernel/vdso.c | 10 ++-------- arch/s390/kernel/vdso.c | 10 ++-------- arch/sh/kernel/vsyscall/vsyscall.c | 3 +-- arch/tile/mm/elf.c | 8 +------- arch/unicore32/kernel/process.c | 2 +- arch/x86/um/mem_32.c | 8 -------- arch/x86/um/vdso/vma.c | 3 +-- arch/x86/vdso/vdso32-setup.c | 17 ++--------------- arch/x86/vdso/vma.c | 3 +-- fs/binfmt_elf.c | 27 +++++++++++++++++++++++++-- include/linux/mm.h | 1 - mm/memory.c | 8 +------- 15 files changed, 40 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index c2ae3cd331fe..219e4efee1a6 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -533,8 +533,7 @@ int vectors_user_mapping(void) struct mm_struct *mm = current->mm; return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYEXEC | - VM_ALWAYSDUMP | VM_RESERVED, + VM_MAYREAD | VM_MAYEXEC | VM_RESERVED, NULL); } diff --git a/arch/hexagon/kernel/vdso.c b/arch/hexagon/kernel/vdso.c index 16277c33308a..f212a453b527 100644 --- a/arch/hexagon/kernel/vdso.c +++ b/arch/hexagon/kernel/vdso.c @@ -78,8 +78,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) /* MAYWRITE to allow gdb to COW and set breakpoints. 
*/ ret = install_special_mapping(mm, vdso_base, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, &vdso_page); if (ret) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index e5cdfd603f8f..0f1af58b036a 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -88,8 +88,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) ret = install_special_mapping(mm, addr, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, &vdso_page); if (ret) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 7d14bb697d40..d36ee1055f88 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -263,17 +263,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * the "data" page of the vDSO or you'll stop getting kernel updates * and your nice userland gettimeofday will be totally dead. * It's fine to use that for setting breakpoints in the vDSO code - * pages though - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. + * pages though. */ rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pagelist); if (rc) { current->mm->context.vdso_base = 0; diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index e704a9965f90..9c80138206b0 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -241,17 +241,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * on the "data" page of the vDSO or you'll stop getting kernel * updates and your nice userland gettimeofday will be totally dead. 
* It's fine to use that for setting breakpoints in the vDSO code - * pages though - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. + * pages though. */ rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pagelist); if (rc) current->mm->context.vdso_base = 0; diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 1d6d51a1ce79..5ca579720a09 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -73,8 +73,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) ret = install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | - VM_ALWAYSDUMP, + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, syscall_pages); if (unlikely(ret)) goto up_fail; diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 55e58e93bfc5..1a00fb64fc88 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -117,17 +117,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, /* * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. Dumping its - * contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to - * see what PC values meant. 
*/ vdso_base = VDSO_BASE; retval = install_special_mapping(mm, vdso_base, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pages); #ifndef __tilegx__ diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 52edc2b62873..432b4291f37b 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -381,7 +381,7 @@ int vectors_user_mapping(void) return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | - VM_ALWAYSDUMP | VM_RESERVED, + VM_RESERVED, NULL); } diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 639900a6fde9..f40281e5d6a2 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -23,14 +23,6 @@ static int __init gate_vma_init(void) gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. 
- */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; - return 0; } __initcall(gate_vma_init); diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 91f4ec9a0a56..af91901babb8 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -64,8 +64,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdsop); up_write(&mm->mmap_sem); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 468d591dde31..a944020fa859 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -250,13 +250,7 @@ static int __init gate_vma_init(void) gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; } @@ -343,17 +337,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully - * interpretable later without matching up the same - * kernel and hardware config to see what PC values - * meant. 
*/ ret = install_special_mapping(mm, addr, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso32_pages); if (ret) diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 153407c35b75..17e18279649f 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -124,8 +124,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pages); if (ret) { current->mm->context.vdso = NULL; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 81878b78c9d4..b64be5b5ac21 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1092,6 +1092,29 @@ out: * Jeremy Fitzhardinge */ +/* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + /* * Decide what to dump of a segment, part, all or none. */ @@ -1100,8 +1123,8 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, { #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* The vma can be set up to tell us the answer directly. 
*/ - if (vma->vm_flags & VM_ALWAYSDUMP) + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) goto whole; /* Hugetlb memory check */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7330742e7973..2de2ddba51d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -111,7 +111,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ #endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ -#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ diff --git a/mm/memory.c b/mm/memory.c index 3416b6e018d6..6105f475fa86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3623,13 +3623,7 @@ static int __init gate_vma_init(void) gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; } __initcall(gate_vma_init); -- cgit v1.2.3 From 65c0ff4079c011232e795e62c74a0a95512b7ac3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 23 Mar 2012 14:02:55 -0700 Subject: x86: Stop recursive fault in print_context_stack after stack overflow After printing out the first line of a stack backtrace, print_context_stack() calls print_ftrace_graph_addr() to check if it's making a graph of function calls, usually not the case. But unfortunate ordering of assignments causes this to oops if an earlier stack overflow corrupted threadinfo->task. Reorder to avoid that irritation. 
( The fact that there was a stack overflow may often be more interesting than the stack that can now be shown; but integrating that information with this stacktrace is awkward, so leave it to overflow reporting. ) Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Cc: Namhyung Kim Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20120323225648.15DD5A033B@akpm.mtv.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4025fe4f928f..90bf130f09bc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -37,13 +37,16 @@ print_ftrace_graph_addr(unsigned long addr, void *data, const struct stacktrace_ops *ops, struct thread_info *tinfo, int *graph) { - struct task_struct *task = tinfo->task; + struct task_struct *task; unsigned long ret_addr; - int index = task->curr_ret_stack; + int index; if (addr != (unsigned long)return_to_handler) return; + task = tinfo->task; + index = task->curr_ret_stack; + if (!task->ret_stack || index < *graph) return; -- cgit v1.2.3 From f5243d6de7ae232e1d81e44ae9756bbd8c988fcd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 23 Mar 2012 16:22:50 -0700 Subject: x86/kconfig: Remove CONFIG_TR=y from the defconfigs Remove CONFIG_TR=y from the x86 defconfigs since token ring support is antiquated and obsolete. ( I reviewed both x86 defconfigs - I didn't come up with anything else that obviously should be removed. 
) Signed-off-by: Randy Dunlap Link: http://lkml.kernel.org/r/4F6D05CA.2050801@xenotime.net [ Twiddled the changelog a bit ] Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 2d562821a88f..119db67dcb03 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -183,7 +183,6 @@ CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set CONFIG_R8169=y CONFIG_FDDI=y -CONFIG_TR=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 3cf137ad2789..76eb2903809f 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -176,7 +176,6 @@ CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y CONFIG_FDDI=y -CONFIG_TR=y CONFIG_INPUT_POLLDEV=y # CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y -- cgit v1.2.3 From 68fe7b23d559763a2e19e5fc1cf7036e4aaecb10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Mar 2012 09:29:22 +0100 Subject: x86: vdso: Put declaration before code Sigh, warnings are there for a reason. 
Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: John Stultz --- arch/x86/kernel/vsyscall_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 4285f1f404c2..d5c69860b524 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -83,9 +83,10 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { - write_seqcount_begin(&vsyscall_gtod_data.seq); struct timespec monotonic; + write_seqcount_begin(&vsyscall_gtod_data.seq); + /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; -- cgit v1.2.3 From c56334dbf7e8772ed84390bc4664427f0a7f3b25 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2011 17:23:39 -0500 Subject: um: merge processor_{32,64}.h a bit... Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/asm/processor.h | 10 ++++++++++ arch/x86/um/asm/processor_32.h | 10 ---------- arch/x86/um/asm/processor_64.h | 10 ---------- 3 files changed, 10 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 2c32df6fe231..04f82e020f2b 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -17,6 +17,16 @@ #define ARCH_IS_STACKGROW(address) \ (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(&current->thread.regs.regs)) +#include + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
*/ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +#define cpu_relax() rep_nop() + #include #endif diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h index 018f732704dd..6c6689e574ce 100644 --- a/arch/x86/um/asm/processor_32.h +++ b/arch/x86/um/asm/processor_32.h @@ -45,16 +45,6 @@ static inline void arch_copy_thread(struct arch_thread *from, memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array)); } -#include - -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -#define cpu_relax() rep_nop() - /* * Default implementation of macro that returns current * instruction pointer ("program counter"). Stolen diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h index 61de92d916c3..4b02a8455bd1 100644 --- a/arch/x86/um/asm/processor_64.h +++ b/arch/x86/um/asm/processor_64.h @@ -14,14 +14,6 @@ struct arch_thread { struct faultinfo faultinfo; }; -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -#define cpu_relax() rep_nop() - #define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 
7 ] = 0 }, \ .debugregs_seq = 0, \ .fs = 0, \ @@ -37,8 +29,6 @@ static inline void arch_copy_thread(struct arch_thread *from, to->fs = from->fs; } -#include - #define current_text_addr() \ ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; }) -- cgit v1.2.3 From c2220b2a124d2fe7b0074b23680177c8e905a76c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Jan 2012 16:30:48 -0500 Subject: um: kill HOST_TASK_PID just provide get_current_pid() to the userland side of things instead of get_current() + manual poking in its results Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/um/include/shared/common-offsets.h | 2 -- arch/um/include/shared/kern_util.h | 2 +- arch/um/kernel/process.c | 4 ++-- arch/x86/um/bugs_32.c | 4 +--- 4 files changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index d7fe563aa7e7..40db8f71deae 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -2,8 +2,6 @@ DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE); -OFFSET(HOST_TASK_PID, task_struct, pid); - DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE); DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK); DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT); diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 0f1483852460..00965d06d2ca 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -48,7 +48,7 @@ extern void do_uml_exitcalls(void); * GFP_ATOMIC. 
*/ extern int __cant_sleep(void); -extern void *get_current(void); +extern int get_current_pid(void); extern int copy_from_user_proc(void *to, void *from, int size); extern int cpu(void); extern char *uml_strdup(const char *string); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 69f24905abdc..f386d04a84a5 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -126,9 +126,9 @@ void exit_thread(void) { } -void *get_current(void) +int get_current_pid(void) { - return current; + return task_pid_nr(current); } /* diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c index a1fba5fb9dbe..17d88cf2c6c4 100644 --- a/arch/x86/um/bugs_32.c +++ b/arch/x86/um/bugs_32.c @@ -13,8 +13,6 @@ static int host_has_cmov = 1; static jmp_buf cmov_test_return; -#define TASK_PID(task) *((int *) &(((char *) (task))[HOST_TASK_PID])) - static void cmov_sigill_test_handler(int sig) { host_has_cmov = 0; @@ -51,7 +49,7 @@ void arch_examine_signal(int sig, struct uml_pt_regs *regs) * This is testing for a cmov (0x0f 0x4x) instruction causing a * SIGILL in init. */ - if ((sig != SIGILL) || (TASK_PID(get_current()) != 1)) + if ((sig != SIGILL) || (get_current_pid() != 1)) return; if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) { -- cgit v1.2.3 From dc5be20a6454312d395dbf07eb2218090a03ae24 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Feb 2012 05:39:56 -0500 Subject: um: most of the SUBARCH uses can be killed Signed-off-by: Al Viro [richard@nod.at: Re-export SUBARCH in arch/um/Makefile] Signed-off-by: Richard Weinberger --- arch/um/Makefile | 4 ++-- arch/um/kernel/Makefile | 2 +- arch/x86/Makefile.um | 4 ---- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/Makefile b/arch/um/Makefile index 28688e6d96d7..4c993c89d0f0 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -50,7 +50,7 @@ KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/um # # These apply to USER_CFLAGS to. 
-KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ +KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \ $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \ -Din6addr_loopback=kernel_in6addr_loopback \ -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr @@ -99,7 +99,7 @@ KBUILD_KCONFIG := $(HOST_DIR)/um/Kconfig archheaders: $(Q)$(MAKE) -C '$(srctree)' KBUILD_SRC= \ - ARCH=$(SUBARCH) O='$(objtree)' archheaders + ARCH=$(HEADER_ARCH) O='$(objtree)' archheaders archprepare: include/generated/user_constants.h diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index bc494741b1f3..492bc4c1b62b 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -3,7 +3,7 @@ # Licensed under the GPL # -CPPFLAGS_vmlinux.lds := -U$(SUBARCH) -DSTART=$(LDS_START) \ +CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ -DELF_ARCH=$(LDS_ELF_ARCH) \ -DELF_FORMAT=$(LDS_ELF_FORMAT) extra-y := vmlinux.lds diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 36ddec6a41c9..4be406abeefd 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -8,15 +8,11 @@ ELF_ARCH := i386 ELF_FORMAT := elf32-i386 CHECKFLAGS += -D__i386__ -ifeq ("$(origin SUBARCH)", "command line") -ifneq ("$(shell uname -m | sed -e s/i.86/i386/)", "$(SUBARCH)") KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS += $(call cc-option,-m32) LINK-y += $(call cc-option,-m32) export LDFLAGS -endif -endif # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. 
include $(srctree)/arch/x86/Makefile_32.cpu -- cgit v1.2.3 From 4c3ff74742b481eaf32d010d072b421c97fd8f08 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Feb 2012 06:15:50 -0500 Subject: um: allow SUBARCH=x86 nicked from patch by dwmw2 back in July Signed-off-by: Al Viro Signed-off-by: Richard Weinberger --- arch/x86/um/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index b2b54d2edf53..9926e11a772d 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -15,8 +15,8 @@ config UML_X86 select GENERIC_FIND_FIRST_BIT config 64BIT - bool - default SUBARCH = "x86_64" + bool "64-bit kernel" if SUBARCH = "x86" + default SUBARCH != "i386" config X86_32 def_bool !64BIT -- cgit v1.2.3 From 90e240142bd31ff10aeda5a280a53153f4eff004 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 25 Mar 2012 23:00:04 +0200 Subject: x86: Merge the x86_32 and x86_64 cpu_idle() functions Both functions are mostly identical. The differences are: - x86_32's cpu_idle() makes use of check_pgt_cache(), which is a nop on both x86_32 and x86_64. - x86_64's cpu_idle() uses enter_idle()/__exit_idle(), on x86_32 these functions are a nop. - In contrast to x86_32, x86_64 calls rcu_idle_enter/exit() in the innermost loop because idle notifications need RCU. Calling these functions on x86_32 also in the innermost loop does not hurt. So we can merge both functions. 
Signed-off-by: Richard Weinberger Acked-by: Frederic Weisbecker Cc: paulmck@linux.vnet.ibm.com Cc: josh@joshtriplett.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1332709204-22496-1-git-send-email-richard@nod.at Signed-off-by: Ingo Molnar --- arch/x86/include/asm/idle.h | 1 + arch/x86/kernel/process.c | 114 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 58 ---------------------- arch/x86/kernel/process_64.c | 107 ---------------------------------------- 4 files changed, 115 insertions(+), 165 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index f49253d75710..c5d1785373ed 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -14,6 +14,7 @@ void exit_idle(void); #else /* !CONFIG_X86_64 */ static inline void enter_idle(void) { } static inline void exit_idle(void) { } +static inline void __exit_idle(void) { } #endif /* CONFIG_X86_64 */ void amd_e400_remove_cpu(int cpu); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14baf78d5a1f..29309c42b9e5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -23,6 +26,24 @@ #include #include #include +#include + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU(unsigned char, is_idle); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); +#endif struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -371,6 +392,99 @@ static inline int hlt_use_halt(void) } #endif +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} 
+#endif + +#ifdef CONFIG_X86_64 +void enter_idle(void) +{ + percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + current_thread_info()->status |= TS_POLLING; + + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_touch_nmi(); + local_irq_disable(); + + enter_idle(); + + /* Don't trace irqs off for idle */ + stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + + if (cpuidle_idle_call()) + pm_idle(); + + rcu_idle_exit(); + start_critical_timings(); + + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. 
*/ + __exit_idle(); + } + + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + /* * We use this if we don't have any better * idle routine.. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9d7d4842bfaf..ea207c245aa4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,7 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include #include #include #include @@ -31,14 +30,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include @@ -58,7 +55,6 @@ #include #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -70,60 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - int cpu = smp_processor_id(); - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). 
- */ - boot_init_stack_canary(); - - current_thread_info()->status |= TS_POLLING; - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); - while (!need_resched()) { - - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(cpu)) - play_dead(); - - local_touch_nmi(); - local_irq_disable(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - if (cpuidle_idle_call()) - pm_idle(); - start_critical_timings(); - } - rcu_idle_exit(); - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 292da13fc5aa..ce5e34f2beca 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,7 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include #include #include #include @@ -32,12 +31,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -52,114 +49,10 @@ #include #include #include -#include asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); -static DEFINE_PER_CPU(unsigned char, is_idle); - -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); - -void enter_idle(void) -{ - percpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify 
idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} - -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - current_thread_info()->status |= TS_POLLING; - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - while (!need_resched()) { - - rmb(); - - if (cpu_is_offline(smp_processor_id())) - play_dead(); - /* - * Idle routines should keep interrupts disabled - * from here on, until they go to idle. - * Otherwise, idle callbacks can misfire. - */ - local_touch_nmi(); - local_irq_disable(); - enter_idle(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - - /* enter_idle() needs rcu for notifiers */ - rcu_idle_enter(); - - if (cpuidle_idle_call()) - pm_idle(); - - rcu_idle_exit(); - start_critical_timings(); - - /* In many cases the interrupt that ended idle - has already called exit_idle. But some idle - loops can be woken up without interrupt. 
*/ - __exit_idle(); - } - - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) -- cgit v1.2.3 From bc758133ed73d4b06952bec21da23e28e62bf3ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Mar 2012 13:16:15 +0200 Subject: sched/x86/smp: Do not enable IRQs over calibrate_delay() We should not ever enable IRQs until we're fully set up. This opens up a window where interrupts can hit the cpu and interrupts can do wakeups, wakeups need state that isn't set-up yet, in particular this cpu isn't eligible to run tasks, so if any cpu-affine task that got created in CPU_UP_PREPARE manages to get a wakeup, its affinity mask will get broken and we'll run into lots of 'interesting' problems. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-yaezmlbriluh166tfkgni22m@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58f78165d308..89571a0c4a49 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -219,14 +219,9 @@ static void __cpuinit smp_callin(void) * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as * accurate as the value just calculated. - * - * Need to enable IRQs because it can take longer and then - * the NMI watchdog might kill us. 
*/ - local_irq_enable(); calibrate_delay(); cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; - local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* -- cgit v1.2.3 From a3c8121b8724c3d496dc00201ab40e8313edcf0d Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Tue, 27 Mar 2012 16:07:40 +0100 Subject: x86/olpc: Add debugfs interface for EC commands Add a debugfs interface for sending commands to the OLPC Embedded Controller (EC) and reading the responses. The EC provides functionality for machine identification, battery and AC control, wakeup control, etc. Having a debugfs interface available is useful for EC development and debugging. Based on code by Paul Fox (who also approves of the end result). Signed-off-by: Daniel Drake Acked-by: Paul Fox Cc: "H. Peter Anvin" Cc: Andres Salomon Link: http://lkml.kernel.org/r/20120327150740.667D09D401E@zog.reactivated.net Signed-off-by: Ingo Molnar --- Documentation/ABI/testing/debugfs-olpc | 16 ++++++ arch/x86/platform/olpc/olpc.c | 97 ++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 Documentation/ABI/testing/debugfs-olpc (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/debugfs-olpc b/Documentation/ABI/testing/debugfs-olpc new file mode 100644 index 000000000000..bd76cc6d55f9 --- /dev/null +++ b/Documentation/ABI/testing/debugfs-olpc @@ -0,0 +1,16 @@ +What: /sys/kernel/debug/olpc-ec/cmd +Date: Dec 2011 +KernelVersion: 3.4 +Contact: devel@lists.laptop.org +Description: + +A generic interface for executing OLPC Embedded Controller commands and +reading their responses. + +To execute a command, write data with the format: CC:N A A A A +CC is the (hex) command, N is the count of expected reply bytes, and A A A A +are optional (hex) arguments. + +To read the response (if any), read from the generic node after executing +a command. Hex reply bytes will be returned, *whether or not* they came from +the immediately previous command. 
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 7cce722667b8..a4bee53c2e54 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -31,6 +33,15 @@ EXPORT_SYMBOL_GPL(olpc_platform_info); static DEFINE_SPINLOCK(ec_lock); +/* debugfs interface to EC commands */ +#define EC_MAX_CMD_ARGS (5 + 1) /* cmd byte + 5 args */ +#define EC_MAX_CMD_REPLY (8) + +static struct dentry *ec_debugfs_dir; +static DEFINE_MUTEX(ec_debugfs_cmd_lock); +static unsigned char ec_debugfs_resp[EC_MAX_CMD_REPLY]; +static unsigned int ec_debugfs_resp_bytes; + /* EC event mask to be applied during suspend (defining wakeup sources). */ static u16 ec_wakeup_mask; @@ -269,6 +280,91 @@ int olpc_ec_sci_query(u16 *sci_value) } EXPORT_SYMBOL_GPL(olpc_ec_sci_query); +static ssize_t ec_debugfs_cmd_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) +{ + int i, m; + unsigned char ec_cmd[EC_MAX_CMD_ARGS]; + unsigned int ec_cmd_int[EC_MAX_CMD_ARGS]; + char cmdbuf[64]; + int ec_cmd_bytes; + + mutex_lock(&ec_debugfs_cmd_lock); + + size = simple_write_to_buffer(cmdbuf, sizeof(cmdbuf), ppos, buf, size); + + m = sscanf(cmdbuf, "%x:%u %x %x %x %x %x", &ec_cmd_int[0], + &ec_debugfs_resp_bytes, + &ec_cmd_int[1], &ec_cmd_int[2], &ec_cmd_int[3], + &ec_cmd_int[4], &ec_cmd_int[5]); + if (m < 2 || ec_debugfs_resp_bytes > EC_MAX_CMD_REPLY) { + /* reset to prevent overflow on read */ + ec_debugfs_resp_bytes = 0; + + printk(KERN_DEBUG "olpc-ec: bad ec cmd: " + "cmd:response-count [arg1 [arg2 ...]]\n"); + size = -EINVAL; + goto out; + } + + /* convert scanf'd ints to char */ + ec_cmd_bytes = m - 2; + for (i = 0; i <= ec_cmd_bytes; i++) + ec_cmd[i] = ec_cmd_int[i]; + + printk(KERN_DEBUG "olpc-ec: debugfs cmd 0x%02x with %d args " + "%02x %02x %02x %02x %02x, want %d returns\n", + ec_cmd[0], ec_cmd_bytes, ec_cmd[1], ec_cmd[2], ec_cmd[3], + ec_cmd[4], ec_cmd[5], 
ec_debugfs_resp_bytes); + + olpc_ec_cmd(ec_cmd[0], (ec_cmd_bytes == 0) ? NULL : &ec_cmd[1], + ec_cmd_bytes, ec_debugfs_resp, ec_debugfs_resp_bytes); + + printk(KERN_DEBUG "olpc-ec: response " + "%02x %02x %02x %02x %02x %02x %02x %02x (%d bytes expected)\n", + ec_debugfs_resp[0], ec_debugfs_resp[1], ec_debugfs_resp[2], + ec_debugfs_resp[3], ec_debugfs_resp[4], ec_debugfs_resp[5], + ec_debugfs_resp[6], ec_debugfs_resp[7], ec_debugfs_resp_bytes); + +out: + mutex_unlock(&ec_debugfs_cmd_lock); + return size; +} + +static ssize_t ec_debugfs_cmd_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + unsigned int i, r; + char *rp; + char respbuf[64]; + + mutex_lock(&ec_debugfs_cmd_lock); + rp = respbuf; + rp += sprintf(rp, "%02x", ec_debugfs_resp[0]); + for (i = 1; i < ec_debugfs_resp_bytes; i++) + rp += sprintf(rp, ", %02x", ec_debugfs_resp[i]); + mutex_unlock(&ec_debugfs_cmd_lock); + rp += sprintf(rp, "\n"); + + r = rp - respbuf; + return simple_read_from_buffer(buf, size, ppos, respbuf, r); +} + +static const struct file_operations ec_debugfs_genops = { + .write = ec_debugfs_cmd_write, + .read = ec_debugfs_cmd_read, +}; + +static void setup_debugfs(void) +{ + ec_debugfs_dir = debugfs_create_dir("olpc-ec", 0); + if (ec_debugfs_dir == ERR_PTR(-ENODEV)) + return; + + debugfs_create_file("cmd", 0600, ec_debugfs_dir, NULL, + &ec_debugfs_genops); +} + static int olpc_ec_suspend(void) { return olpc_ec_mask_write(ec_wakeup_mask); @@ -372,6 +468,7 @@ static int __init olpc_init(void) } register_syscore_ops(&olpc_syscore_ops); + setup_debugfs(); return 0; } -- cgit v1.2.3 From 136d249ef7dbf0fefa292082cc40be1ea864cbd6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 21 Mar 2012 22:58:08 -0400 Subject: x86/ioapic: Add io_apic_ops driver layer to allow interception Xen dom0 needs to paravirtualize IO operations to the IO APIC, so add a io_apic_ops for it to intercept. 
Do this as ops structure because there's at least some chance that another paravirtualized environment may want to intercept these. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Suresh Siddha Cc: jwboyer@redhat.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/1332385090-18056-2-git-send-email-konrad.wilk@oracle.com [ Made all the affected code easier on the eyes ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 9 +++++++ arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 690d1cc9a877..2c4943de5150 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -21,6 +21,15 @@ #define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) #define IO_APIC_REDIR_MASKED (1 << 16) +struct io_apic_ops { + void (*init) (void); + unsigned int (*read) (unsigned int apic, unsigned int reg); + void (*write) (unsigned int apic, unsigned int reg, unsigned int value); + void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); +}; + +void __init set_io_apic_ops(const struct io_apic_ops *); + /* * The structure of the IO-APIC: */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2c428c5d7ca3..e88300d8e80a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -64,9 +64,28 @@ #include #define __apicdebuginit(type) static type __init + #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) +static void __init __ioapic_init_mappings(void); + +static unsigned int __io_apic_read (unsigned int apic, unsigned int reg); +static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); +static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); + +static struct io_apic_ops io_apic_ops = { + 
.init = __ioapic_init_mappings, + .read = __io_apic_read, + .write = __io_apic_write, + .modify = __io_apic_modify, +}; + +void __init set_io_apic_ops(const struct io_apic_ops *ops) +{ + io_apic_ops = *ops; +} + /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes @@ -294,6 +313,22 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg) irq_free_desc(at); } +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + return io_apic_ops.read(apic, reg); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + io_apic_ops.write(apic, reg, value); +} + +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + io_apic_ops.modify(apic, reg, value); +} + + struct io_apic { unsigned int index; unsigned int unused[3]; @@ -314,16 +349,17 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) writel(vector, &io_apic->eoi); } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(reg, &io_apic->index); return readl(&io_apic->data); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -334,7 +370,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i * * Older SiS APIC requires we rewrite the index register */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -377,6 +413,7 @@ static struct IO_APIC_route_entry 
__ioapic_read_entry(int apic, int pin) eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; } @@ -384,9 +421,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; } @@ -396,8 +435,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) * the interrupt, and we need to make sure the entry is fully populated * before that happens. */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { union entry_union eu = {{0, 0}}; @@ -409,6 +447,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -435,8 +474,7 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static int -__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -521,6 +559,7 @@ static void io_apic_sync(struct irq_pin_list *entry) * a dummy read from the IO-APIC */ struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); readl(&io_apic->data); } @@ -3893,6 +3932,11 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) } void __init ioapic_and_gsi_init(void) +{ + io_apic_ops.init(); +} + +static void __init __ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; -- cgit v1.2.3 From baa676fcf8d555269bd0a5a2496782beee55824d Mon Sep 17 00:00:00 2001 From: Andrzej Pietrasiewicz Date: Tue, 27 Mar 2012 14:28:18 +0200 Subject: X86 & IA64: adapt for dma_map_ops changes Adapt core x86 and IA64 architecture code for dma_map_ops changes: replace alloc/free_coherent with generic alloc/free methods. 
Signed-off-by: Andrzej Pietrasiewicz Acked-by: Kyungmin Park [removed swiotlb related changes and replaced it with wrappers, merged with IA64 patch to avoid inter-patch dependences in intel-iommu code] Signed-off-by: Marek Szyprowski Reviewed-by: Arnd Bergmann Acked-by: Tony Luck --- arch/ia64/hp/common/sba_iommu.c | 11 ++++++----- arch/ia64/include/asm/dma-mapping.h | 18 ++++++++++++------ arch/ia64/kernel/pci-swiotlb.c | 14 +++++++++++--- arch/ia64/sn/pci/pci_dma.c | 9 +++++---- arch/x86/include/asm/dma-mapping.h | 26 ++++++++++++++++---------- arch/x86/kernel/amd_gart_64.c | 11 ++++++----- arch/x86/kernel/pci-calgary_64.c | 9 +++++---- arch/x86/kernel/pci-dma.c | 3 ++- arch/x86/kernel/pci-nommu.c | 6 +++--- arch/x86/kernel/pci-swiotlb.c | 17 +++++++++++++---- arch/x86/xen/pci-swiotlb-xen.c | 4 ++-- drivers/iommu/amd_iommu.c | 10 ++++++---- drivers/iommu/intel-iommu.c | 9 +++++---- drivers/xen/swiotlb-xen.c | 5 +++-- include/xen/swiotlb-xen.h | 6 ++++-- 15 files changed, 99 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index f5f4ef149aac..e5eb9c4b2198 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1130,7 +1130,8 @@ void sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, * See Documentation/DMA-API-HOWTO.txt */ static void * -sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags) +sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t flags, struct dma_attrs *attrs) { struct ioc *ioc; void *addr; @@ -1192,8 +1193,8 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp * * See Documentation/DMA-API-HOWTO.txt */ -static void sba_free_coherent (struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) +static void sba_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct 
dma_attrs *attrs) { sba_unmap_single_attrs(dev, dma_handle, size, 0, NULL); free_pages((unsigned long) vaddr, get_order(size)); @@ -2213,8 +2214,8 @@ sba_page_override(char *str) __setup("sbapagesize=",sba_page_override); struct dma_map_ops sba_dma_ops = { - .alloc_coherent = sba_alloc_coherent, - .free_coherent = sba_free_coherent, + .alloc = sba_alloc_coherent, + .free = sba_free_coherent, .map_page = sba_map_page, .unmap_page = sba_unmap_page, .map_sg = sba_map_sg_attrs, diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 4336d080b241..4f5e8148440d 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -23,23 +23,29 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t, extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, enum dma_data_direction); -static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *daddr, gfp_t gfp) +#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) + +static inline void *dma_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *daddr, gfp_t gfp, + struct dma_attrs *attrs) { struct dma_map_ops *ops = platform_dma_get_ops(dev); void *caddr; - caddr = ops->alloc_coherent(dev, size, daddr, gfp); + caddr = ops->alloc(dev, size, daddr, gfp, attrs); debug_dma_alloc_coherent(dev, size, *daddr, caddr); return caddr; } -static inline void dma_free_coherent(struct device *dev, size_t size, - void *caddr, dma_addr_t daddr) +#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) + +static inline void dma_free_attrs(struct device *dev, size_t size, + void *caddr, dma_addr_t daddr, + struct dma_attrs *attrs) { struct dma_map_ops *ops = platform_dma_get_ops(dev); debug_dma_free_coherent(dev, size, caddr, daddr); - ops->free_coherent(dev, size, caddr, daddr); + ops->free(dev, size, caddr, daddr, attrs); } #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, 
f) diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index d9485d952ed0..939260aeac98 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -15,16 +15,24 @@ int swiotlb __read_mostly; EXPORT_SYMBOL(swiotlb); static void *ia64_swiotlb_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) { if (dev->coherent_dma_mask != DMA_BIT_MASK(64)) gfp |= GFP_DMA; return swiotlb_alloc_coherent(dev, size, dma_handle, gfp); } +static void ia64_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) +{ + swiotlb_free_coherent(dev, size, vaddr, dma_addr); +} + struct dma_map_ops swiotlb_dma_ops = { - .alloc_coherent = ia64_swiotlb_alloc_coherent, - .free_coherent = swiotlb_free_coherent, + .alloc = ia64_swiotlb_alloc_coherent, + .free = ia64_swiotlb_free_coherent, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, .map_sg = swiotlb_map_sg_attrs, diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index a9d310de57da..3290d6e00c31 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -76,7 +76,8 @@ EXPORT_SYMBOL(sn_dma_set_mask); * more information. */ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, gfp_t flags) + dma_addr_t * dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *cpuaddr; unsigned long phys_addr; @@ -137,7 +138,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, * any associated IOMMU mappings. 
*/ static void sn_dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) + dma_addr_t dma_handle, struct dma_attrs *attrs) { struct pci_dev *pdev = to_pci_dev(dev); struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); @@ -466,8 +467,8 @@ int sn_pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size) } static struct dma_map_ops sn_dma_ops = { - .alloc_coherent = sn_dma_alloc_coherent, - .free_coherent = sn_dma_free_coherent, + .alloc = sn_dma_alloc_coherent, + .free = sn_dma_free_coherent, .map_page = sn_dma_map_page, .unmap_page = sn_dma_unmap_page, .map_sg = sn_dma_map_sg, diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ed3065fd6314..4b4331d71935 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -59,7 +59,8 @@ extern int dma_supported(struct device *hwdev, u64 mask); extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag); + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs); static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { @@ -111,9 +112,11 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) return gfp; } +#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) + static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp) +dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, struct dma_attrs *attrs) { struct dma_map_ops *ops = get_dma_ops(dev); void *memory; @@ -129,18 +132,21 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (!is_device_dma_capable(dev)) return NULL; - if (!ops->alloc_coherent) + if (!ops->alloc) return NULL; - memory = ops->alloc_coherent(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, 
gfp)); + memory = ops->alloc(dev, size, dma_handle, + dma_alloc_coherent_gfp_flags(dev, gfp), attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, memory); return memory; } -static inline void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) +#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) + +static inline void dma_free_attrs(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus, + struct dma_attrs *attrs) { struct dma_map_ops *ops = get_dma_ops(dev); @@ -150,8 +156,8 @@ static inline void dma_free_coherent(struct device *dev, size_t size, return; debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free_coherent) - ops->free_coherent(dev, size, vaddr, bus); + if (ops->free) + ops->free(dev, size, vaddr, bus, attrs); } #endif diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b1e7c7f7a0af..e66311200cbd 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -477,7 +477,7 @@ error: /* allocate and map a coherent mapping */ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag) + gfp_t flag, struct dma_attrs *attrs) { dma_addr_t paddr; unsigned long align_mask; @@ -500,7 +500,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, } __free_pages(page, get_order(size)); } else - return dma_generic_alloc_coherent(dev, size, dma_addr, flag); + return dma_generic_alloc_coherent(dev, size, dma_addr, flag, + attrs); return NULL; } @@ -508,7 +509,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, /* free a coherent mapping */ static void gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) + dma_addr_t dma_addr, struct dma_attrs *attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); free_pages((unsigned long)vaddr, get_order(size)); @@ -700,8 +701,8 @@ static struct dma_map_ops gart_dma_ops = { 
.unmap_sg = gart_unmap_sg, .map_page = gart_map_page, .unmap_page = gart_unmap_page, - .alloc_coherent = gart_alloc_coherent, - .free_coherent = gart_free_coherent, + .alloc = gart_alloc_coherent, + .free = gart_free_coherent, .mapping_error = gart_mapping_error, }; diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 726494b58345..07b587c5a2d2 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -431,7 +431,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, } static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) + dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) { void *ret = NULL; dma_addr_t mapping; @@ -464,7 +464,8 @@ error: } static void calgary_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) { unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); @@ -477,8 +478,8 @@ static void calgary_free_coherent(struct device *dev, size_t size, } static struct dma_map_ops calgary_dma_ops = { - .alloc_coherent = calgary_alloc_coherent, - .free_coherent = calgary_free_coherent, + .alloc = calgary_alloc_coherent, + .free = calgary_free_coherent, .map_sg = calgary_map_sg, .unmap_sg = calgary_unmap_sg, .map_page = calgary_map_page, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769e21ea..75e1cc19e630 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -96,7 +96,8 @@ void __init pci_iommu_alloc(void) } } void *dma_generic_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag) + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs) { unsigned long dma_mask; struct page *page; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 3af4af810c07..f96050685b46 100644 --- a/arch/x86/kernel/pci-nommu.c 
+++ b/arch/x86/kernel/pci-nommu.c @@ -75,7 +75,7 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, } static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) + dma_addr_t dma_addr, struct dma_attrs *attrs) { free_pages((unsigned long)vaddr, get_order(size)); } @@ -96,8 +96,8 @@ static void nommu_sync_sg_for_device(struct device *dev, } struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, + .alloc = dma_generic_alloc_coherent, + .free = nommu_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, .sync_single_for_device = nommu_sync_single_for_device, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 8f972cbddef0..6c483ba98b9c 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -15,21 +15,30 @@ int swiotlb __read_mostly; static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *vaddr; - vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags); + vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, + attrs); if (vaddr) return vaddr; return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); } +static void x86_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) +{ + swiotlb_free_coherent(dev, size, vaddr, dma_addr); +} + static struct dma_map_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, - .alloc_coherent = x86_swiotlb_alloc_coherent, - .free_coherent = swiotlb_free_coherent, + .alloc = x86_swiotlb_alloc_coherent, + .free = x86_swiotlb_free_coherent, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, diff --git 
a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index b480d4207a4c..967633ad98c4 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -12,8 +12,8 @@ int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { .mapping_error = xen_swiotlb_dma_mapping_error, - .alloc_coherent = xen_swiotlb_alloc_coherent, - .free_coherent = xen_swiotlb_free_coherent, + .alloc = xen_swiotlb_alloc_coherent, + .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, .sync_single_for_device = xen_swiotlb_sync_single_for_device, .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index f75e0608be5b..daa333f97b78 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2707,7 +2707,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, * The exported alloc_coherent function for dma_ops. */ static void *alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag) + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs) { unsigned long flags; void *virt_addr; @@ -2765,7 +2766,8 @@ out_free: * The exported free_coherent function for dma_ops. 
*/ static void free_coherent(struct device *dev, size_t size, - void *virt_addr, dma_addr_t dma_addr) + void *virt_addr, dma_addr_t dma_addr, + struct dma_attrs *attrs) { unsigned long flags; struct protection_domain *domain; @@ -2846,8 +2848,8 @@ static void prealloc_protection_domains(void) } static struct dma_map_ops amd_iommu_dma_ops = { - .alloc_coherent = alloc_coherent, - .free_coherent = free_coherent, + .alloc = alloc_coherent, + .free = free_coherent, .map_page = map_page, .unmap_page = unmap_page, .map_sg = map_sg, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index c9c6053198d4..e39bfdc055c3 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2938,7 +2938,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, } static void *intel_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *vaddr; int order; @@ -2970,7 +2971,7 @@ static void *intel_alloc_coherent(struct device *hwdev, size_t size, } static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dma_handle) + dma_addr_t dma_handle, struct dma_attrs *attrs) { int order; @@ -3115,8 +3116,8 @@ static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr) } struct dma_map_ops intel_dma_ops = { - .alloc_coherent = intel_alloc_coherent, - .free_coherent = intel_free_coherent, + .alloc = intel_alloc_coherent, + .free = intel_free_coherent, .map_sg = intel_map_sg, .unmap_sg = intel_unmap_sg, .map_page = intel_map_page, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 19e6a2041371..1afb4fba11b4 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -204,7 +204,8 @@ error: void * xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { 
void *ret; int order = get_order(size); @@ -253,7 +254,7 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); void xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dev_addr) + dma_addr_t dev_addr, struct dma_attrs *attrs) { int order = get_order(size); phys_addr_t phys; diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 2ea2fdc79c16..4f4d449f00f6 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -7,11 +7,13 @@ extern void xen_swiotlb_init(int verbose); extern void *xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags); + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs); extern void xen_swiotlb_free_coherent(struct device *hwdev, size_t size, - void *vaddr, dma_addr_t dma_handle); + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs); extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, -- cgit v1.2.3 From 8f0750f19789cf352d7e24a6cc50f2ab1b4f1372 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 24 Mar 2012 10:52:50 +0300 Subject: x86, tls: Off by one limit check These are used as offsets into an array of GDT_ENTRY_TLS_ENTRIES members so GDT_ENTRY_TLS_ENTRIES is one past the end of the array. Signed-off-by: Dan Carpenter Link: http://lkml.kernel.org/r/20120324075250.GA28258@elgon.mountain Cc: Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/tls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6bb7b8579e70..bcfec2d23769 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -163,7 +163,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset, { const struct desc_struct *tls; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; @@ -198,7 +198,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; - if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || (count % sizeof(struct user_desc)) != 0) return -EINVAL; -- cgit v1.2.3 From f05e798ad4c09255f590f5b2c00a7ca6c172f983 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:11:12 +0100 Subject: Disintegrate asm/system.h for X86 Disintegrate asm/system.h for X86. Signed-off-by: David Howells Acked-by: H. 
Peter Anvin cc: x86@kernel.org --- arch/x86/ia32/ia32_aout.c | 1 - arch/x86/include/asm/apic.h | 1 - arch/x86/include/asm/auxvec.h | 7 + arch/x86/include/asm/barrier.h | 116 +++++++ arch/x86/include/asm/bug.h | 4 + arch/x86/include/asm/cacheflush.h | 1 + arch/x86/include/asm/elf.h | 1 - arch/x86/include/asm/exec.h | 1 + arch/x86/include/asm/futex.h | 1 - arch/x86/include/asm/i387.h | 1 - arch/x86/include/asm/local.h | 1 - arch/x86/include/asm/mc146818rtc.h | 1 - arch/x86/include/asm/processor.h | 31 +- arch/x86/include/asm/segment.h | 58 +++- arch/x86/include/asm/special_insns.h | 199 ++++++++++++ arch/x86/include/asm/stackprotector.h | 1 - arch/x86/include/asm/switch_to.h | 129 ++++++++ arch/x86/include/asm/system.h | 527 +------------------------------ arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/include/asm/virtext.h | 1 - arch/x86/kernel/acpi/cstate.c | 1 + arch/x86/kernel/apm_32.c | 1 - arch/x86/kernel/cpu/mcheck/p5.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 - arch/x86/kernel/cpu/mcheck/winchip.c | 1 - arch/x86/kernel/cpu/mtrr/generic.c | 1 - arch/x86/kernel/cpuid.c | 1 - arch/x86/kernel/i8259.c | 1 - arch/x86/kernel/irqinit.c | 1 - arch/x86/kernel/kgdb.c | 1 - arch/x86/kernel/ldt.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/mca_32.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/msr.c | 1 - arch/x86/kernel/paravirt.c | 1 + arch/x86/kernel/pci-calgary_64.c | 1 - arch/x86/kernel/process.c | 1 - arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/kernel/tce_64.c | 1 + arch/x86/kernel/tls.c | 1 - arch/x86/kernel/traps.c | 1 - arch/x86/mm/init.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/power/hibernate_32.c | 1 - 50 files changed, 554 insertions(+), 562 deletions(-) create mode 100644 arch/x86/include/asm/barrier.h create mode 100644 arch/x86/include/asm/exec.h create mode 
100644 arch/x86/include/asm/special_insns.h create mode 100644 arch/x86/include/asm/switch_to.h (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 4c2e59a420b9..d511d951a052 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -26,7 +26,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9371c91718c..4b2caeefe1a2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #define ARCH_APICTIMER_STOPS_ON_C3 1 diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h index 1316b4c35425..77203ac352de 100644 --- a/arch/x86/include/asm/auxvec.h +++ b/arch/x86/include/asm/auxvec.h @@ -9,4 +9,11 @@ #endif #define AT_SYSINFO_EHDR 33 +/* entries in ARCH_DLINFO: */ +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) +# define AT_VECTOR_SIZE_ARCH 2 +#else /* else it's non-compat x86-64 */ +# define AT_VECTOR_SIZE_ARCH 1 +#endif + #endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h new file mode 100644 index 000000000000..c6cd358a1eec --- /dev/null +++ b/arch/x86/include/asm/barrier.h @@ -0,0 +1,116 @@ +#ifndef _ASM_X86_BARRIER_H +#define _ASM_X86_BARRIER_H + +#include +#include + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ + +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. 
+ */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". 
Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static __always_inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index f654d1bb17fb..11e1152222d0 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -36,4 +36,8 @@ do { \ #endif /* !CONFIG_BUG */ #include + + +extern void show_regs_common(void); + #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 4e12668711e5..9863ee3747da 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -3,6 +3,7 @@ /* Caches aren't brain-dead on the intel. 
*/ #include +#include #ifdef CONFIG_X86_PAT /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5f962df30d0f..f27f79abe021 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -84,7 +84,6 @@ extern unsigned int vdso_enabled; (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) #include -#include #ifdef CONFIG_X86_32 #include diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h new file mode 100644 index 000000000000..54c2e1db274a --- /dev/null +++ b/arch/x86/include/asm/exec.h @@ -0,0 +1 @@ +/* define arch_align_stack() here */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index d09bb03653f0..71ecbcba1a4e 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -9,7 +9,6 @@ #include #include #include -#include #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ asm volatile("1:\t" insn "\n" \ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 7ce0798b1b26..257d9cca214f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -14,7 +14,6 @@ #include #include -#include struct pt_regs; struct user_i387_struct; diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 9cdae5d47e8f..c8bed0da434a 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -3,7 +3,6 @@ #include -#include #include #include diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0e8e85bb7c51..d354fb781c57 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -5,7 +5,6 @@ #define _ASM_X86_MC146818RTC_H #include -#include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 95da14f7ee85..78e30ea492b2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -14,13 +14,13 @@ struct mm_struct; #include #include #include 
-#include #include #include #include #include #include #include +#include #include #include @@ -29,6 +29,15 @@ struct mm_struct; #include #include #include +#include + +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. + */ +#define NET_IP_ALIGN 0 #define HBP_NUM 4 /* @@ -1022,4 +1031,24 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ +#ifdef CONFIG_X86_32 +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#endif + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void default_idle(void); +bool set_pm_idle_to_default(void); + +void stop_this_cpu(void *dummy); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5e641715c3fe..165466233ab0 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -212,7 +212,61 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10]; -#endif -#endif + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. 
+ */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +#endif /* !__ASSEMBLY__ */ +#endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h new file mode 100644 index 000000000000..41fc93a2e225 --- /dev/null +++ b/arch/x86/include/asm/special_insns.h @@ -0,0 +1,199 @@ +#ifndef _ASM_X86_SPECIAL_INSNS_H +#define _ASM_X86_SPECIAL_INSNS_H + + +#ifdef __KERNEL__ + +static inline void native_clts(void) +{ + asm volatile("clts"); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. 
Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr2(unsigned long val) +{ + asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr3(unsigned long val) +{ + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist. In x86_64, a cr4 always + * exists, so it will never fail. 
*/ +#ifdef CONFIG_X86_32 + asm volatile("1: mov %%cr4, %0\n" + "2:\n" + _ASM_EXTABLE(1b, 2b) + : "=r" (val), "=m" (__force_order) : "0" (0)); +#else + val = native_read_cr4(); +#endif + return val; +} + +static inline void native_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long native_read_cr8(void) +{ + unsigned long cr8; + asm volatile("movq %%cr8,%0" : "=r" (cr8)); + return cr8; +} + +static inline void native_write_cr8(unsigned long val) +{ + asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); +} +#endif + +static inline void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +extern void native_load_gs_index(unsigned); + +#ifdef CONFIG_PARAVIRT +#include +#else + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + +#ifdef CONFIG_X86_64 + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + +#endif + +/* Clear the 'TS' bit */ +static inline void clts(void) +{ + native_clts(); +} + +#endif/* CONFIG_PARAVIRT */ + +#define stts() 
write_cr0(read_cr0() | X86_CR0_TS) + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 157517763565..b5d9533d2c38 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -38,7 +38,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h new file mode 100644 index 000000000000..4ec45b3abba1 --- /dev/null +++ b/arch/x86/include/asm/switch_to.h @@ -0,0 +1,129 @@ +#ifndef _ASM_X86_SWITCH_TO_H +#define _ASM_X86_SWITCH_TO_H + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +struct tss_struct; +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (stack_canary.canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. 
\ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +#else /* CONFIG_X86_32 */ + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (irq_stack_union.stack_canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* 
CC_STACKPROTECTOR */ + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (current_task) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) + +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2d2f01ce6dcb..0d84f9e42fde 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -1,523 +1,6 @@ -#ifndef _ASM_X86_SYSTEM_H -#define _ASM_X86_SYSTEM_H - -#include -#include -#include +/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! 
*/ +#include #include -#include - -#include -#include - -/* entries in ARCH_DLINFO: */ -#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) -# define AT_VECTOR_SIZE_ARCH 2 -#else /* else it's non-compat x86-64 */ -# define AT_VECTOR_SIZE_ARCH 1 -#endif - -struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); -struct tss_struct; -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss); -extern void show_regs_common(void); - -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ -#define switch_to(prev, next, last) \ -do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. 
\ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ - asm volatile("pushfl\n\t" /* save flags */ \ - "pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - "popfl\n" /* restore flags */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ - \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ -} while (0) - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -#else - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define 
__switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* Save restore flags to clear handle leaking NT */ -#define switch_to(prev, next, last) \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) -#endif - -#ifdef __KERNEL__ - -extern void native_load_gs_index(unsigned); - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. - */ -#define loadsegment(seg, value) \ -do { \ - unsigned short __val = (value); \ - \ - asm volatile(" \n" \ - "1: movl %k0,%%" #seg " \n" \ - \ - ".section .fixup,\"ax\" \n" \ - "2: xorl %k0,%k0 \n" \ - " jmp 1b \n" \ - ".previous \n" \ - \ - _ASM_EXTABLE(1b, 2b) \ - \ - : "+r" (__val) : : "memory"); \ -} while (0) - -/* - * Save a segment register away - */ -#define savesegment(seg, value) \ - asm("mov %%" #seg ",%0":"=r" (value) : : "memory") - -/* - * x86_32 user gs accessors. 
- */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ -#endif /* X86_32 */ - -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - -static inline void native_clts(void) -{ - asm volatile("clts"); -} - -/* - * Volatile isn't enough to prevent the compiler from reordering the - * read/write functions for the control registers and messing everything up. - * A memory clobber would solve the problem, but would prevent reordering of - * all loads stores around it, which can hurt performance. 
Solution is to - * use a variable and mimic reads and writes to it to enforce serialization - */ -static unsigned long __force_order; - -static inline unsigned long native_read_cr0(void) -{ - unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr0(unsigned long val) -{ - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr2(void) -{ - unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr2(unsigned long val) -{ - asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr3(void) -{ - unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr3(unsigned long val) -{ - asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr4(void) -{ - unsigned long val; - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist. In x86_64, a cr4 always - * exists, so it will never fail. 
*/ -#ifdef CONFIG_X86_32 - asm volatile("1: mov %%cr4, %0\n" - "2:\n" - _ASM_EXTABLE(1b, 2b) - : "=r" (val), "=m" (__force_order) : "0" (0)); -#else - val = native_read_cr4(); -#endif - return val; -} - -static inline void native_write_cr4(unsigned long val) -{ - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); -} - -#ifdef CONFIG_X86_64 -static inline unsigned long native_read_cr8(void) -{ - unsigned long cr8; - asm volatile("movq %%cr8,%0" : "=r" (cr8)); - return cr8; -} - -static inline void native_write_cr8(unsigned long val) -{ - asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); -} -#endif - -static inline void native_wbinvd(void) -{ - asm volatile("wbinvd": : :"memory"); -} - -#ifdef CONFIG_PARAVIRT -#include -#else - -static inline unsigned long read_cr0(void) -{ - return native_read_cr0(); -} - -static inline void write_cr0(unsigned long x) -{ - native_write_cr0(x); -} - -static inline unsigned long read_cr2(void) -{ - return native_read_cr2(); -} - -static inline void write_cr2(unsigned long x) -{ - native_write_cr2(x); -} - -static inline unsigned long read_cr3(void) -{ - return native_read_cr3(); -} - -static inline void write_cr3(unsigned long x) -{ - native_write_cr3(x); -} - -static inline unsigned long read_cr4(void) -{ - return native_read_cr4(); -} - -static inline unsigned long read_cr4_safe(void) -{ - return native_read_cr4_safe(); -} - -static inline void write_cr4(unsigned long x) -{ - native_write_cr4(x); -} - -static inline void wbinvd(void) -{ - native_wbinvd(); -} - -#ifdef CONFIG_X86_64 - -static inline unsigned long read_cr8(void) -{ - return native_read_cr8(); -} - -static inline void write_cr8(unsigned long x) -{ - native_write_cr8(x); -} - -static inline void load_gs_index(unsigned selector) -{ - native_load_gs_index(selector); -} - -#endif - -/* Clear the 'TS' bit */ -static inline void clts(void) -{ - native_clts(); -} - -#endif/* CONFIG_PARAVIRT */ - -#define stts() write_cr0(read_cr0() | X86_CR0_TS) - -#endif /* 
__KERNEL__ */ - -static inline void clflush(volatile void *__p) -{ - asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); -} - -#define nop() asm volatile ("nop") - -void disable_hlt(void); -void enable_hlt(void); - -void cpu_idle_wait(void); - -extern unsigned long arch_align_stack(unsigned long sp); -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); - -void default_idle(void); -bool set_pm_idle_to_default(void); - -void stop_this_cpu(void *dummy); - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - */ -#ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) -#else -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") -#endif - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. 
- * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() -#else -# define smp_rmb() barrier() -#endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); -} - -/* - * We handle most unaligned accesses in hardware. 
On the other hand - * unaligned DMA can be quite expensive on some Nehalem processors. - * - * Based on this we disable the IP header alignment in network drivers. - */ -#define NET_IP_ALIGN 0 -#endif /* _ASM_X86_SYSTEM_H */ +#include +#include +#include diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 169be8938b96..c0e108e08079 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,7 +5,7 @@ #include #include -#include +#include #ifdef CONFIG_PARAVIRT #include diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index e0f9aa16358b..5da71c27cc59 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -16,7 +16,6 @@ #define _ASM_X86_VIRTEX_H #include -#include #include #include diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index f50e7fb2a201..d2b7f27781bc 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * Initialize bm_flags based on the CPU cache properties diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d56931a15b3..459e78cbf61e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -231,7 +231,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 5c0e6533d9bc..2d5454cd2c4f 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 67bb17a37a0a..47a1870279aa 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -25,7 +25,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c 
b/arch/x86/kernel/cpu/mcheck/winchip.c index 54060f565974..2d7998fb628c 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 97b26356e9ee..75772ae6c65f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353d93f2..39472dd2323f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -43,7 +43,6 @@ #include #include -#include static struct class *cpuid_class; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 610485223bdb..36d1853e91af 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbce..99b85b423bbf 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771acad..4425a12ece43 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,7 +46,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ea697263b373..ebc987398923 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index a3fa43ba5d3b..5b19e4d78b00 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git 
a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 177183cbb6ae..7eb1e2b97827 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 925179f871de..f21fd94ac897 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -26,7 +26,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 96356762a51d..eb113693f043 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,6 @@ #include #include -#include static struct class *msr_class; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ada2f99388dd..2b26485f0c11 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -37,6 +37,7 @@ #include #include #include +#include /* nop stub */ void _paravirt_nop(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 726494b58345..6ac5782f4d6b 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14baf78d5a1f..9b24f36eb55f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9d7d4842bfaf..aae4f4bbbe88 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include @@ -59,6 +58,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 292da13fc5aa..61270e8d428a 100644 --- 
a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -40,7 +40,6 @@ #include #include -#include #include #include #include @@ -53,6 +52,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 78f05e438be5..8a634c887652 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88638883176a..8cbeb7209c3e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -90,7 +90,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index 9e540fee7009..ab40954e113e 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -34,6 +34,7 @@ #include #include #include +#include /* flush a tce at 'tceaddr' to main memory */ static inline void flush_tce(void* tceaddr) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6bb7b8579e70..73920e4c6dc5 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ec61d4c1b93b..860f126ca233 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6cabf6570d64..4f0cec7e4ffb 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8663f6c47ccb..575d86f85ce4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init_64.c 
b/arch/x86/mm/init_64.c index 436a0309db33..fc18be0f6f29 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cac718499256..a69bcb8c7621 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 3769079874d8..74202c1910cd 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 49a7f04a4b9d45cd794741ce3d5d66524b37bdd0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Move all declarations of free_initmem() to linux/mm.h Move all declarations of free_initmem() to linux/mm.h so that there's only one and it's used by everything. 
Signed-off-by: David Howells cc: linux-c6x-dev@linux-c6x.org cc: microblaze-uclinux@itee.uq.edu.au cc: linux-sh@vger.kernel.org cc: sparclinux@vger.kernel.org cc: x86@kernel.org cc: linux-mm@kvack.org --- arch/c6x/include/asm/system.h | 1 - arch/frv/include/asm/system.h | 2 -- arch/microblaze/include/asm/system.h | 1 - arch/sh/include/asm/system.h | 1 - arch/sparc/mm/init_64.h | 2 -- arch/x86/include/asm/page_types.h | 1 - include/linux/mm.h | 2 ++ init/main.c | 1 - 8 files changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/c6x/include/asm/system.h b/arch/c6x/include/asm/system.h index ccc4f86d16c5..0d84f9e42fde 100644 --- a/arch/c6x/include/asm/system.h +++ b/arch/c6x/include/asm/system.h @@ -4,4 +4,3 @@ #include #include #include -extern void free_initmem(void); diff --git a/arch/frv/include/asm/system.h b/arch/frv/include/asm/system.h index 5c707a235403..659bcdb44eca 100644 --- a/arch/frv/include/asm/system.h +++ b/arch/frv/include/asm/system.h @@ -1,6 +1,4 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! 
*/ #include #include #include #include -extern void free_initmem(void); diff --git a/arch/microblaze/include/asm/system.h b/arch/microblaze/include/asm/system.h index ccc4f86d16c5..0d84f9e42fde 100644 --- a/arch/microblaze/include/asm/system.h +++ b/arch/microblaze/include/asm/system.h @@ -4,4 +4,3 @@ #include #include #include -extern void free_initmem(void); diff --git a/arch/sh/include/asm/system.h b/arch/sh/include/asm/system.h index e2042aa32f2c..04268aa3b3e5 100644 --- a/arch/sh/include/asm/system.h +++ b/arch/sh/include/asm/system.h @@ -6,4 +6,3 @@ #include #include #include -void free_initmem(void); diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h index 77d1b313e344..3e1ac8b96cae 100644 --- a/arch/sparc/mm/init_64.h +++ b/arch/sparc/mm/init_64.h @@ -36,8 +36,6 @@ extern unsigned long kern_locked_tte_data; extern void prom_world(int enter); -extern void free_initmem(void); - #ifdef CONFIG_SPARSEMEM_VMEMMAP #define VMEMMAP_CHUNK_SHIFT 22 #define VMEMMAP_CHUNK (1UL << VMEMMAP_CHUNK_SHIFT) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index bce688d54c12..e21fdd10479f 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -55,7 +55,6 @@ extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); extern void initmem_init(void); -extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7330742e7973..69f6d7b7eb01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1257,6 +1257,8 @@ static inline void pgtable_page_dtor(struct page *page) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); +extern void free_initmem(void); + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its diff --git a/init/main.c 
b/init/main.c index c24805c824b9..44c9754e2a5c 100644 --- a/init/main.c +++ b/init/main.c @@ -87,7 +87,6 @@ extern void mca_init(void); extern void sbus_init(void); extern void prio_tree_init(void); extern void radix_tree_init(void); -extern void free_initmem(void); #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } #endif -- cgit v1.2.3 From 141124c02059eee9dbc5c86ea797b1ca888e77f7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Delete all instances of asm/system.h Delete all instances of asm/system.h as they should be redundant by this point. Signed-off-by: David Howells --- arch/alpha/include/asm/system.h | 5 ----- arch/avr32/include/asm/system.h | 6 ------ arch/blackfin/include/asm/system.h | 5 ----- arch/c6x/include/asm/system.h | 6 ------ arch/cris/include/asm/system.h | 5 ----- arch/frv/include/asm/system.h | 4 ---- arch/h8300/include/asm/system.h | 5 ----- arch/hexagon/include/asm/system.h | 5 ----- arch/ia64/include/asm/system.h | 4 ---- arch/m32r/include/asm/system.h | 6 ------ arch/m68k/include/asm/system.h | 5 ----- arch/microblaze/include/asm/system.h | 6 ------ arch/mips/include/asm/system.h | 5 ----- arch/mn10300/include/asm/system.h | 5 ----- arch/openrisc/include/asm/system.h | 5 ----- arch/parisc/include/asm/system.h | 6 ------ arch/powerpc/include/asm/system.h | 6 ------ arch/s390/include/asm/system.h | 7 ------- arch/score/include/asm/system.h | 5 ----- arch/sh/include/asm/system.h | 8 -------- arch/sparc/include/asm/system.h | 6 ------ arch/tile/include/asm/system.h | 4 ---- arch/unicore32/include/asm/system.h | 5 ----- arch/x86/include/asm/system.h | 6 ------ arch/xtensa/include/asm/system.h | 5 ----- include/asm-generic/system.h | 5 ----- 26 files changed, 140 deletions(-) delete mode 100644 arch/alpha/include/asm/system.h delete mode 100644 arch/avr32/include/asm/system.h delete mode 100644 arch/blackfin/include/asm/system.h delete mode 100644 arch/c6x/include/asm/system.h 
delete mode 100644 arch/cris/include/asm/system.h delete mode 100644 arch/frv/include/asm/system.h delete mode 100644 arch/h8300/include/asm/system.h delete mode 100644 arch/hexagon/include/asm/system.h delete mode 100644 arch/ia64/include/asm/system.h delete mode 100644 arch/m32r/include/asm/system.h delete mode 100644 arch/m68k/include/asm/system.h delete mode 100644 arch/microblaze/include/asm/system.h delete mode 100644 arch/mips/include/asm/system.h delete mode 100644 arch/mn10300/include/asm/system.h delete mode 100644 arch/openrisc/include/asm/system.h delete mode 100644 arch/parisc/include/asm/system.h delete mode 100644 arch/powerpc/include/asm/system.h delete mode 100644 arch/s390/include/asm/system.h delete mode 100644 arch/score/include/asm/system.h delete mode 100644 arch/sh/include/asm/system.h delete mode 100644 arch/sparc/include/asm/system.h delete mode 100644 arch/tile/include/asm/system.h delete mode 100644 arch/unicore32/include/asm/system.h delete mode 100644 arch/x86/include/asm/system.h delete mode 100644 arch/xtensa/include/asm/system.h delete mode 100644 include/asm-generic/system.h (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/system.h b/arch/alpha/include/asm/system.h deleted file mode 100644 index c7270dcff321..000000000000 --- a/arch/alpha/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h deleted file mode 100644 index 0d84f9e42fde..000000000000 --- a/arch/avr32/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/blackfin/include/asm/system.h b/arch/blackfin/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/blackfin/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. 
DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/c6x/include/asm/system.h b/arch/c6x/include/asm/system.h deleted file mode 100644 index 0d84f9e42fde..000000000000 --- a/arch/c6x/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/cris/include/asm/system.h b/arch/cris/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/cris/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/frv/include/asm/system.h b/arch/frv/include/asm/system.h deleted file mode 100644 index 659bcdb44eca..000000000000 --- a/arch/frv/include/asm/system.h +++ /dev/null @@ -1,4 +0,0 @@ -#include -#include -#include -#include diff --git a/arch/h8300/include/asm/system.h b/arch/h8300/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/h8300/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/hexagon/include/asm/system.h b/arch/hexagon/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/hexagon/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h deleted file mode 100644 index 5b190b48fcd0..000000000000 --- a/arch/ia64/include/asm/system.h +++ /dev/null @@ -1,4 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include diff --git a/arch/m32r/include/asm/system.h b/arch/m32r/include/asm/system.h deleted file mode 100644 index a55c384fdcf3..000000000000 --- a/arch/m32r/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. 
DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/m68k/include/asm/system.h b/arch/m68k/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/m68k/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/microblaze/include/asm/system.h b/arch/microblaze/include/asm/system.h deleted file mode 100644 index 0d84f9e42fde..000000000000 --- a/arch/microblaze/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/mips/include/asm/system.h b/arch/mips/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/mips/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/mn10300/include/asm/system.h b/arch/mn10300/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/mn10300/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/openrisc/include/asm/system.h b/arch/openrisc/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/openrisc/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/parisc/include/asm/system.h b/arch/parisc/include/asm/system.h deleted file mode 100644 index fc2c1261ac1f..000000000000 --- a/arch/parisc/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! 
*/ -#include -#include -#include -#include -#include diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h deleted file mode 100644 index 502c1e0275af..000000000000 --- a/arch/powerpc/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h deleted file mode 100644 index 641c72903277..000000000000 --- a/arch/s390/include/asm/system.h +++ /dev/null @@ -1,7 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include -#include diff --git a/arch/score/include/asm/system.h b/arch/score/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/score/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/sh/include/asm/system.h b/arch/sh/include/asm/system.h deleted file mode 100644 index 04268aa3b3e5..000000000000 --- a/arch/sh/include/asm/system.h +++ /dev/null @@ -1,8 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include -#include -#include diff --git a/arch/sparc/include/asm/system.h b/arch/sparc/include/asm/system.h deleted file mode 100644 index ed532ba000b1..000000000000 --- a/arch/sparc/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/tile/include/asm/system.h b/arch/tile/include/asm/system.h deleted file mode 100644 index 5b190b48fcd0..000000000000 --- a/arch/tile/include/asm/system.h +++ /dev/null @@ -1,4 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! 
*/ -#include -#include -#include diff --git a/arch/unicore32/include/asm/system.h b/arch/unicore32/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/unicore32/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h deleted file mode 100644 index 0d84f9e42fde..000000000000 --- a/arch/x86/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/xtensa/include/asm/system.h b/arch/xtensa/include/asm/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/arch/xtensa/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/include/asm-generic/system.h b/include/asm-generic/system.h deleted file mode 100644 index a7f40578587c..000000000000 --- a/include/asm-generic/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -- cgit v1.2.3 From 8abc3122aa02567bfe626cd13f4d34853c9b1225 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 27 Mar 2012 20:04:02 +0200 Subject: x86/apic/amd: Be more verbose about LVT offset assignments Add information about LVT offset assignments to better debug firmware bugs related to this. See following examples. 
# dmesg | grep -i 'offset\|ibs' LVT offset 0 assigned for vector 0xf9 [Firmware Bug]: cpu 0, try to use APIC500 (LVT offset 0) for vector 0x10400, but the register is already in use for vector 0xf9 on another cpu [Firmware Bug]: cpu 0, IBS interrupt offset 0 not available (MSRC001103A=0x0000000000000100) Failed to setup IBS, -22 In this case the BIOS assigns both offsets for MCE (0xf9) and IBS (0x400) vectors to offset 0, which is why the second APIC setup (IBS) failed. With correct setup you get: # dmesg | grep -i 'offset\|ibs' LVT offset 0 assigned for vector 0xf9 LVT offset 1 assigned for vector 0x400 IBS: LVT offset 1 assigned perf: AMD IBS detected (0x00000007) oprofile: AMD IBS detected (0x00000007) Note: The vector includes also the message type to handle also NMIs (0x400). In the firmware bug message the format is the same as of the APIC500 register and includes the mask bit (bit 16) in addition. Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2eec05b6d1b8..11544d8f1e97 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -383,20 +383,25 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) static unsigned int reserve_eilvt_offset(int offset, unsigned int new) { - unsigned int rsvd; /* 0: uninitialized */ + unsigned int rsvd, vector; if (offset >= APIC_EILVT_NR_MAX) return ~0; - rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED; + rsvd = atomic_read(&eilvt_offsets[offset]); do { - if (rsvd && - !eilvt_entry_is_changeable(rsvd, new)) + vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */ + if (vector && !eilvt_entry_is_changeable(vector, new)) /* may not change if vectors are different */ return rsvd; rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); } while (rsvd != new); + 
rsvd &= ~APIC_EILVT_MASKED; + if (rsvd && rsvd != vector) + pr_info("LVT offset %d assigned for vector 0x%02x\n", + offset, rsvd); + return new; } -- cgit v1.2.3 From 09c71bfd8384278c42f56380365940508194cec0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: kdump x86: fix total mem size calculation for reservation Crashkernel reservation needs to know the total memory size. Currently get_total_mem() simply uses max_pfn - min_low_pfn. This is wrong because it includes memory holes in the middle. This shows up especially for a kvm guest with memory > 0xe0000000, because the qemu code splits memory as below: if (ram_size >= 0xe0000000 ) { above_4g_mem_size = ram_size - 0xe0000000; below_4g_mem_size = 0xe0000000; } else { below_4g_mem_size = ram_size; } So for a 4G mem guest, seabios will insert a 512M usable region beyond 4G. Thus in the above case max_pfn - min_low_pfn will be more than the original memsize. Fix this issue by using memblock_phys_mem_size() to get the total memsize. Signed-off-by: Dave Young Reviewed-by: WANG Cong Reviewed-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88638883176a..ab77aae4ad9b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -509,15 +509,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_pfn - min_low_pfn; - - return total << PAGE_SHIFT; -} - /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. 
@@ -536,7 +527,7 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - total_mem = get_total_mem(); + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); -- cgit v1.2.3 From 5f054e31c63be774bf1ce252f20d56012a00f8a5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Mar 2012 15:38:31 +1030 Subject: documentation: remove references to cpu_*_map. This has been obsolescent for a while, fix documentation and misc comments. Signed-off-by: Rusty Russell --- Documentation/cgroups/cpusets.txt | 2 +- Documentation/cpu-hotplug.txt | 22 +++++++++++----------- arch/alpha/kernel/smp.c | 2 +- arch/ia64/kernel/acpi.c | 2 +- arch/mips/cavium-octeon/smp.c | 2 +- arch/mips/pmc-sierra/yosemite/smp.c | 2 +- arch/mips/sibyte/bcm1480/smp.c | 2 +- arch/tile/kernel/setup.c | 2 +- arch/x86/xen/enlighten.c | 2 +- init/Kconfig | 4 ++-- kernel/cpuset.c | 10 +++++----- 11 files changed, 26 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 5c51ed406d1d..cefd3d8bbd11 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -217,7 +217,7 @@ and name space for cpusets, with a minimum of additional kernel code. The cpus and mems files in the root (top_cpuset) cpuset are read-only. The cpus file automatically tracks the value of -cpu_online_map using a CPU hotplug notifier, and the mems file +cpu_online_mask using a CPU hotplug notifier, and the mems file automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., nodes with memory--using the cpuset_track_online_nodes() hook. diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index a20bfd415e41..66ef8f35613d 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -47,7 +47,7 @@ maxcpus=n Restrict boot time cpus to n. 
Say if you have 4 cpus, using other cpus later online, read FAQ's for more info. additional_cpus=n (*) Use this to limit hotpluggable cpus. This option sets - cpu_possible_map = cpu_present_map + additional_cpus + cpu_possible_mask = cpu_present_mask + additional_cpus cede_offline={"off","on"} Use this option to disable/enable putting offlined processors to an extended H_CEDE state on @@ -64,11 +64,11 @@ should only rely on this to count the # of cpus, but *MUST* not rely on the apicid values in those tables for disabled apics. In the event BIOS doesn't mark such hot-pluggable cpus as disabled entries, one could use this parameter "additional_cpus=x" to represent those cpus in the -cpu_possible_map. +cpu_possible_mask. possible_cpus=n [s390,x86_64] use this to set hotpluggable cpus. This option sets possible_cpus bits in - cpu_possible_map. Thus keeping the numbers of bits set + cpu_possible_mask. Thus keeping the numbers of bits set constant even if the machine gets rebooted. CPU maps and such @@ -76,7 +76,7 @@ CPU maps and such [More on cpumaps and primitive to manipulate, please check include/linux/cpumask.h that has more descriptive text.] -cpu_possible_map: Bitmap of possible CPUs that can ever be available in the +cpu_possible_mask: Bitmap of possible CPUs that can ever be available in the system. This is used to allocate some boot time memory for per_cpu variables that aren't designed to grow/shrink as CPUs are made available or removed. Once set during boot time discovery phase, the map is static, i.e no bits @@ -84,13 +84,13 @@ are added or removed anytime. Trimming it accurately for your system needs upfront can save some boot time memory. See below for how we use heuristics in x86_64 case to keep this under check. -cpu_online_map: Bitmap of all CPUs currently online. Its set in __cpu_up() +cpu_online_mask: Bitmap of all CPUs currently online. 
Its set in __cpu_up() after a cpu is available for kernel scheduling and ready to receive interrupts from devices. Its cleared when a cpu is brought down using __cpu_disable(), before which all OS services including interrupts are migrated to another target CPU. -cpu_present_map: Bitmap of CPUs currently present in the system. Not all +cpu_present_mask: Bitmap of CPUs currently present in the system. Not all of them may be online. When physical hotplug is processed by the relevant subsystem (e.g ACPI) can change and new bit either be added or removed from the map depending on the event is hot-add/hot-remove. There are currently @@ -99,22 +99,22 @@ at which time hotplug is disabled. You really dont need to manipulate any of the system cpu maps. They should be read-only for most use. When setting up per-cpu resources almost always use -cpu_possible_map/for_each_possible_cpu() to iterate. +cpu_possible_mask/for_each_possible_cpu() to iterate. Never use anything other than cpumask_t to represent bitmap of CPUs. #include - for_each_possible_cpu - Iterate over cpu_possible_map - for_each_online_cpu - Iterate over cpu_online_map - for_each_present_cpu - Iterate over cpu_present_map + for_each_possible_cpu - Iterate over cpu_possible_mask + for_each_online_cpu - Iterate over cpu_online_mask + for_each_present_cpu - Iterate over cpu_present_mask for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask. #include get_online_cpus() and put_online_cpus(): The above calls are used to inhibit cpu hotplug operations. While the -cpu_hotplug.refcount is non zero, the cpu_online_map will not change. +cpu_hotplug.refcount is non zero, the cpu_online_mask will not change. If you merely need to avoid cpus going away, you could also use preempt_disable() and preempt_enable() for those sections. 
Just remember the critical section cannot call any diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 4087a569b43b..50d438db1f6b 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -450,7 +450,7 @@ setup_smp(void) smp_num_probed = 1; } - printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_map = %lx\n", + printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_mask = %lx\n", smp_num_probed, cpumask_bits(cpu_present_mask)[0]); } diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index ac795d311f44..6f38b6120d96 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -839,7 +839,7 @@ static __init int setup_additional_cpus(char *s) early_param("additional_cpus", setup_additional_cpus); /* - * cpu_possible_map should be static, it cannot change as CPUs + * cpu_possible_mask should be static, it cannot change as CPUs * are onlined, or offlined. The reason is per-cpu data-structures * are allocated by some modules at init time, and dont expect to * do this dynamically on cpu arrival/departure. 
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ef56d8a0215f..97e7ce9b50ed 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -78,7 +78,7 @@ static inline void octeon_send_ipi_mask(const struct cpumask *mask, } /** - * Detect available CPUs, populate cpu_possible_map + * Detect available CPUs, populate cpu_possible_mask */ static void octeon_smp_hotplug_setup(void) { diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index 00a0d2e90d04..b71fae231049 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -146,7 +146,7 @@ static void __cpuinit yos_boot_secondary(int cpu, struct task_struct *idle) } /* - * Detect available CPUs, populate cpu_possible_map before smp_init + * Detect available CPUs, populate cpu_possible_mask before smp_init * * We don't want to start the secondary CPU yet nor do we have a nice probing * feature in PMON so we just assume presence of the secondary core. diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 63d2211e6167..de88e22694a0 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -138,7 +138,7 @@ static void __cpuinit bcm1480_boot_secondary(int cpu, struct task_struct *idle) /* * Use CFE to find out how many CPUs are available, setting up - * cpu_possible_map and the logical/physical mappings. + * cpu_possible_mask and the logical/physical mappings. * XXXKW will the boot CPU ever not be physical 0? * * Common setup before any secondaries are started diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 5124093b2e1d..92a94f4920ad 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -1100,7 +1100,7 @@ EXPORT_SYMBOL(hash_for_home_map); /* * cpu_cacheable_map lists all the cpus whose caches the hypervisor can - * flush on our behalf. It is set to cpu_possible_map OR'ed with + * flush on our behalf. 
It is set to cpu_possible_mask OR'ed with * hash_for_home_map, and it is what should be passed to * hv_flush_remote() to flush all caches. Note that if there are * dedicated hypervisor driver tiles that have authorized use of their diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b132ade26f77..4f51bebac02c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -967,7 +967,7 @@ void xen_setup_shared_info(void) xen_setup_mfn_list_list(); } -/* This is called once we have the cpu_possible_map */ +/* This is called once we have the cpu_possible_mask */ void xen_setup_vcpu_info_placement(void) { int cpu; diff --git a/init/Kconfig b/init/Kconfig index 72f33faca44f..6cfd71d06463 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1414,8 +1414,8 @@ endif # MODULES config INIT_ALL_POSSIBLE bool help - Back when each arch used to define their own cpu_online_map and - cpu_possible_map, some of them chose to initialize cpu_possible_map + Back when each arch used to define their own cpu_online_mask and + cpu_possible_mask, some of them chose to initialize cpu_possible_mask with all 1s, and others with all 0s. When they were centralised, it was better to provide this option than to break all the archs and have several arch maintainers pursuing me down dark alleys. diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1010cc61931f..eedeebe64b1a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = { * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, - * return cpu_online_map. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_map. + * return cpu_online_mask. Or if passed a NULL cs from an exit'ing + * task, return cpu_online_mask. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_map. 
+ * of cpu_online_mask. * * Call with callback_mutex held. */ @@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, int retval; int is_load_balanced; - /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; @@ -2149,7 +2149,7 @@ void __init cpuset_init_smp(void) * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_map, even if this means going outside the + * subset of cpu_online_mask, even if this means going outside the * tasks cpuset. **/ -- cgit v1.2.3 From 99dd5497e5be4fe4194cad181d45fd6569a930db Mon Sep 17 00:00:00 2001 From: "Liu, Chuansheng" Date: Mon, 26 Mar 2012 07:11:50 +0000 Subject: x86: Preserve lazy irq disable semantics in fixup_irqs() The default irq_disable() semantics are to mark the interrupt disabled, but keep it unmasked. If the interrupt is delivered while marked disabled, the low level interrupt handler masks it and marks it pending. This is important for detecting wakeup interrupts during suspend and for edge type interrupts to avoid losing interrupts. fixup_irqs() moves the interrupts away from an offlined cpu. For certain interrupt types it needs to mask the interrupt line before changing the affinity. After affinity has changed the interrupt line is unmasked again, but only if it is not marked disabled. This breaks the lazy irq disable semantics and causes problems in suspend as the interrupt can be lost or wakeup functionality is broken. Check irqd_irq_masked() instead of irqd_irq_disabled() because irqd_irq_masked() is only set when the core code actually masked the interrupt line. If it's not set, we unmask the interrupt and let the lazy irq disable logic deal with a possibly incoming interrupt. 
[ tglx: Massaged changelog and added a comment ] Signed-off-by: liu chuansheng Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/27240C0AC20F114CBF8149A2696CBE4A05DFB3@SHSMSX101.ccr.corp.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7943e0c21bde..3dafc6003b7c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -282,8 +282,13 @@ void fixup_irqs(void) else if (!(warned++)) set_affinity = 0; + /* + * We unmask if the irq was not marked masked by the + * core code. That respects the lazy irq disable + * behaviour. + */ if (!irqd_can_move_in_process_context(data) && - !irqd_irq_disabled(data) && chip->irq_unmask) + !irqd_irq_masked(data) && chip->irq_unmask) chip->irq_unmask(data); raw_spin_unlock(&desc->lock); -- cgit v1.2.3 From 1d24fb3684f347226747c6b11ea426b7b992694e Mon Sep 17 00:00:00 2001 From: "zhuangfeiran@ict.ac.cn" Date: Wed, 28 Mar 2012 23:27:00 +0000 Subject: x86 bpf_jit: fix a bug in emitting the 16-bit immediate operand of AND When K >= 0xFFFF0000, AND needs the two least significant bytes of K as its operand, but EMIT2() gives it the least significant byte of K and 0x2. EMIT() should be used here to replace EMIT2(). Signed-off-by: Feiran Zhuang Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5671752f8d9c..5a5b6e4dd738 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -289,7 +289,7 @@ void bpf_jit_compile(struct sk_filter *fp) EMIT2(0x24, K & 0xFF); /* and imm8,%al */ } else if (K >= 0xFFFF0000) { EMIT2(0x66, 0x25); /* and imm16,%ax */ - EMIT2(K, 2); + EMIT(K, 2); } else { EMIT1_off32(0x25, K); /* and imm32,%eax */ } -- cgit v1.2.3 From 3751d3e85cf693e10e2c47c03c8caa65e171099b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 23 Mar 2012 09:35:05 -0500 Subject: x86,kgdb: Fix DEBUG_RODATA limitation using text_poke() There has long been a limitation using software breakpoints with a kernel compiled with CONFIG_DEBUG_RODATA going back to 2.6.26. For this particular patch, it will apply cleanly and has been tested all the way back to 2.6.36. The kprobes code uses the text_poke() function which accommodates writing a breakpoint into a read-only page. The x86 kgdb code can solve the problem similarly by overriding the default breakpoint set/remove routines and using text_poke() directly. The x86 kgdb code will first attempt to use the traditional probe_kernel_write(), and next try using the text_poke() function. The break point install method is tracked such that the correct break point removal routine will get called later on. Cc: x86@kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H.
Peter Anvin Cc: stable@vger.kernel.org # >= 2.6.36 Inspried-by: Masami Hiramatsu Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/misc/kgdbts.c | 17 -------------- include/linux/kgdb.h | 3 ++- 3 files changed, 62 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index fdc37b3d0ce3..b9bd9d8de665 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #include @@ -742,6 +744,64 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + char opc[BREAK_INSTR_SIZE]; + + bpt->type = BP_BREAKPOINT; + err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (err) + return err; + err = probe_kernel_write((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); +#ifdef CONFIG_DEBUG_RODATA + if (!err) + return err; + /* + * It is safe to call text_poke() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + return -EBUSY; + text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); + err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); + if (err) + return err; + if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) + return -EINVAL; + bpt->type = BP_POKE_BREAKPOINT; +#endif /* CONFIG_DEBUG_RODATA */ + return err; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ +#ifdef CONFIG_DEBUG_RODATA + int err; + char opc[BREAK_INSTR_SIZE]; + + if (bpt->type != BP_POKE_BREAKPOINT) + goto knl_write; + /* + * It is safe to call text_poke() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. 
+ */ + if (mutex_is_locked(&text_mutex)) + goto knl_write; + text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); + err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); + if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) + goto knl_write; + return err; +knl_write: +#endif /* CONFIG_DEBUG_RODATA */ + return probe_kernel_write((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); +} + struct kgdb_arch arch_kgdb_ops = { /* Breakpoint instruction: */ .gdb_bpt_instr = { 0xcc }, diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index d087456ba089..3aa9a969b373 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -968,22 +968,6 @@ static void run_singlestep_break_test(void) kgdbts_break_test(); } -static void test_debug_rodata(void) -{ -#ifdef CONFIG_DEBUG_RODATA - /* Until there is an api to write to read-only text segments, use - * HW breakpoints for the remainder of any tests, else print a - * failure message if hw breakpoints do not work. 
- */ - if (!(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT && hwbreaks_ok)) { - eprintk("kgdbts: HW breakpoints BROKEN, ending tests\n"); - return; - } - force_hwbrks = 1; - v1printk("kgdbts:Using HW breakpoints for SW breakpoint tests\n"); -#endif /* CONFIG_DEBUG_RODATA */ -} - static void kgdbts_run_tests(void) { char *ptr; @@ -1016,7 +1000,6 @@ static void kgdbts_run_tests(void) v1printk("kgdbts:RUN access write breakpoint test\n"); run_hw_break_test(0); } - test_debug_rodata(); /* required internal KGDB tests */ v1printk("kgdbts:RUN plant and detach test\n"); diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index e5d689c1d774..c4d2fc194ede 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -63,7 +63,8 @@ enum kgdb_bptype { BP_HARDWARE_BREAKPOINT, BP_WRITE_WATCHPOINT, BP_READ_WATCHPOINT, - BP_ACCESS_WATCHPOINT + BP_ACCESS_WATCHPOINT, + BP_POKE_BREAKPOINT, }; enum kgdb_bpstate { -- cgit v1.2.3 From f6365201d8a21fb347260f89d6e9b3e718d63c70 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 29 Mar 2012 14:49:17 -0700 Subject: x86: Remove the ancient and deprecated disable_hlt() and enable_hlt() facility The X86_32-only disable_hlt/enable_hlt mechanism was used by the 32-bit floppy driver. Its effect was to replace the use of the HLT instruction inside default_idle() with cpu_relax() - essentially it turned off the use of HLT. This workaround was commented in the code as: "disable hlt during certain critical i/o operations" "This halt magic was a workaround for ancient floppy DMA wreckage. It should be safe to remove." H. Peter Anvin additionally adds: "To the best of my knowledge, no-hlt only existed because of flaky power distributions on 386/486 systems which were sold to run DOS. Since DOS did no power management of any kind, including HLT, the power draw was fairly uniform; when exposed to the much higher noise levels you got when Linux used HLT caused some of these systems to fail. They were by far in the minority even back then."
Alan Cox further says: "Also for the Cyrix 5510 which tended to go castors up if a HLT occurred during a DMA cycle and on a few other boxes HLT during DMA tended to go astray. Do we care ? I doubt it. The 5510 was pretty obscure, the 5520 fixed it, the 5530 is probably the oldest still in any kind of use." So, let's finally drop this. Signed-off-by: Len Brown Signed-off-by: Josh Boyer Signed-off-by: Andrew Morton Acked-by: "H. Peter Anvin" Acked-by: Alan Cox Cc: Stephen Hemminger Cc: Link: http://lkml.kernel.org/n/tip-3rhk9bzf0x9rljkv488tloib@git.kernel.org [ If anyone cares then alternative instruction patching could be used to replace HLT with a one-byte NOP instruction. Much simpler. ] Signed-off-by: Ingo Molnar --- Documentation/feature-removal-schedule.txt | 8 ------- arch/x86/include/asm/processor.h | 10 --------- arch/x86/kernel/process.c | 24 -------------------- drivers/block/floppy.c | 36 ------------------------------ 4 files changed, 78 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 0cad4803ffac..7c950d48d76e 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,14 +6,6 @@ be removed from this file. --------------------------- -What: x86 floppy disable_hlt -When: 2012 -Why: ancient workaround of dubious utility clutters the - code used by everybody else. 
-Who: Len Brown - ---------------------------- - What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle When: 2012 Why: This optional sub-feature of APM is of dubious reliability, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7284c9a6a0b5..4fa7dcceb6c0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -974,16 +974,6 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ -#ifdef CONFIG_X86_32 -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -#endif - -void disable_hlt(void); -void enable_hlt(void); - void cpu_idle_wait(void); extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index a33afaa5ddb7..1d92a5ab6e8b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -362,34 +362,10 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); #endif -#ifdef CONFIG_X86_32 -/* - * This halt magic was a workaround for ancient floppy DMA - * wreckage. It should be safe to remove. 
- */ -static int hlt_counter; -void disable_hlt(void) -{ - hlt_counter++; -} -EXPORT_SYMBOL(disable_hlt); - -void enable_hlt(void) -{ - hlt_counter--; -} -EXPORT_SYMBOL(enable_hlt); - -static inline int hlt_use_halt(void) -{ - return (!hlt_counter && boot_cpu_data.hlt_works_ok); -} -#else static inline int hlt_use_halt(void) { return 1; } -#endif #ifndef CONFIG_SMP static inline void play_dead(void) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 76a08236430a..b0b00d70c166 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1030,37 +1030,6 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function) return 0; } -static DEFINE_SPINLOCK(floppy_hlt_lock); -static int hlt_disabled; -static void floppy_disable_hlt(void) -{ - unsigned long flags; - - WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012"); - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (!hlt_disabled) { - hlt_disabled = 1; -#ifdef HAVE_DISABLE_HLT - disable_hlt(); -#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - -static void floppy_enable_hlt(void) -{ - unsigned long flags; - - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (hlt_disabled) { - hlt_disabled = 0; -#ifdef HAVE_DISABLE_HLT - enable_hlt(); -#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - static void setup_DMA(void) { unsigned long f; @@ -1105,7 +1074,6 @@ static void setup_DMA(void) fd_enable_dma(); release_dma_lock(f); #endif - floppy_disable_hlt(); } static void show_floppy(void); @@ -1707,7 +1675,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) fd_disable_dma(); release_dma_lock(f); - floppy_enable_hlt(); do_floppy = NULL; if (fdc >= N_FDC || FDCS->address == -1) { /* we don't even know which FDC is the culprit */ @@ -1856,8 +1823,6 @@ static void floppy_shutdown(unsigned long data) show_floppy(); cancel_activity(); - floppy_enable_hlt(); - flags = claim_dma_lock(); fd_disable_dma(); release_dma_lock(flags); @@ -4508,7 
+4473,6 @@ static void floppy_release_irq_and_dma(void) #if N_FDC > 1 set_dor(1, ~8, 0); #endif - floppy_enable_hlt(); if (floppy_track_buffer && max_buffer_sectors) { tmpsize = max_buffer_sectors * 1024; -- cgit v1.2.3 From 1a022e3f1be11730bd8747b1af96a0274bf6356e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 13 Mar 2012 19:55:09 +0100 Subject: idle, x86: Allow off-lined CPU to enter deeper C states Currently when a CPU is off-lined it enters either MWAIT-based idle or, if MWAIT is not desired or supported, HLT-based idle (which places the processor in C1 state). This patch allows processors without MWAIT support to stay in states deeper than C1. Signed-off-by: Boris Ostrovsky Signed-off-by: Len Brown --- arch/x86/kernel/smpboot.c | 4 +++- drivers/acpi/processor_idle.c | 31 +++++++++++++++++++++++++++++++ drivers/cpuidle/cpuidle.c | 28 ++++++++++++++++++++++++++++ include/linux/cpuidle.h | 5 +++++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..93a2a0932b51 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -1422,7 +1423,8 @@ void native_play_dead(void) tboot_shutdown(TB_SHUTDOWN_WFS); mwait_play_dead(); /* Only returns on failure */ - hlt_play_dead(); + if (cpuidle_play_dead()) + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 0e8e2de2ed3e..6b1d32a161ae 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -770,6 +770,35 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, return index; } + +/** + * acpi_idle_play_dead - enters an ACPI state for long-term idle (i.e. 
off-lining) + * @dev: the target CPU + * @index: the index of suggested state + */ +static int acpi_idle_play_dead(struct cpuidle_device *dev, int index) +{ + struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; + struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage); + + ACPI_FLUSH_CPU_CACHE(); + + while (1) { + + if (cx->entry_method == ACPI_CSTATE_HALT) + halt(); + else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { + inb(cx->address); + /* See comment in acpi_idle_do_entry() */ + inl(acpi_gbl_FADT.xpm_timer_block.address); + } else + return -ENODEV; + } + + /* Never reached */ + return 0; +} + /** * acpi_idle_enter_simple - enters an ACPI state without BM handling * @dev: the target CPU @@ -1077,12 +1106,14 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr) state->flags |= CPUIDLE_FLAG_TIME_VALID; state->enter = acpi_idle_enter_c1; + state->enter_dead = acpi_idle_play_dead; drv->safe_state_index = count; break; case ACPI_STATE_C2: state->flags |= CPUIDLE_FLAG_TIME_VALID; state->enter = acpi_idle_enter_simple; + state->enter_dead = acpi_idle_play_dead; drv->safe_state_index = count; break; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index f7cab5e9c4d6..3e146b2ada4a 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -71,6 +71,34 @@ typedef int (*cpuidle_enter_t)(struct cpuidle_device *dev, static cpuidle_enter_t cpuidle_enter_ops; +/** + * cpuidle_play_dead - cpu off-lining + * + * Only returns in case of an error + */ +int cpuidle_play_dead(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_driver(); + int i, dead_state = -1; + int power_usage = -1; + + /* Find lowest-power state that supports long-term idle */ + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + if (s->power_usage < power_usage && s->enter_dead) { + power_usage = 
s->power_usage; + dead_state = i; + } + } + + if (dead_state != -1) + return drv->states[dead_state].enter_dead(dev, dead_state); + + return -ENODEV; +} + /** * cpuidle_idle_call - the main idle loop * diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index f3ebbba368b3..d557bcd0ada7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -51,6 +51,8 @@ struct cpuidle_state { int (*enter) (struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); + + int (*enter_dead) (struct cpuidle_device *dev, int index); }; /* Idle State Flags */ @@ -147,6 +149,8 @@ extern int cpuidle_wrap_enter(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index, int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index)); +extern int cpuidle_play_dead(void); + #else static inline void disable_cpuidle(void) { } static inline int cpuidle_idle_call(void) { return -ENODEV; } @@ -168,6 +172,7 @@ static inline int cpuidle_wrap_enter(struct cpuidle_device *dev, int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index)) { return -ENODEV; } +static inline int cpuidle_play_dead(void) {return -ENODEV; } #endif -- cgit v1.2.3 From ac909ec308ce8d5177963c780564824d12bc3fa2 Mon Sep 17 00:00:00 2001 From: Petr Vandrovec Date: Thu, 8 Mar 2012 13:33:24 -0800 Subject: ACPI: Fix use-after-free in acpi_map_lsapic When processor is being hot-added to the system, acpi_map_lsapic invokes ACPI _MAT method to find APIC ID and flags, verifies that returned structure is indeed ACPI's local APIC structure, and that flags contain MADT_ENABLED bit. Then saves APIC ID, frees structure - and accesses structure when computing arguments for acpi_register_lapic call. Which sometime leads to acpi_register_lapic call being made with second argument zero, failing to bring processor online with error 'Unable to map lapic to logical cpu number'. 
As lapic->lapic_flags & ACPI_MADT_ENABLED was already confirmed to be non-zero a few lines above, we can just pass unconditional ACPI_MADT_ENABLED to the acpi_register_lapic. Signed-off-by: Petr Vandrovec Signed-off-by: Alok N Kataria Reviewed-by: Toshi Kani Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ce664f33ea8e..bbcc2c389ade 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -642,6 +642,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) kfree(buffer.pointer); buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; + lapic = NULL; if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL)) goto out; @@ -650,7 +651,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) goto free_tmp_map; cpumask_copy(tmp_map, cpu_present_mask); - acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED); + acpi_register_lapic(physid, ACPI_MADT_ENABLED); /* * If mp_register_lapic successfully generates a new logical cpu -- cgit v1.2.3 From c0e9afc0da6cb0f11497e5ea83377b3c451450e0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 28 Mar 2012 11:51:17 -0700 Subject: x86: Use -mno-avx when available On gccs that support AVX it's a good idea to disable that too, similar to how SSE2, SSE1 etc. are already disabled. This prevents the compiler from ever generating AVX implicitly. No failure observed, just from review. [ hpa: Marking this for urgent and stable, simply because the patch will either have absolutely no effect *or* it will avoid potentially very hard to debug failures. ] Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1332960678-11879-1-git-send-email-andi@firstfloor.org Signed-off-by: H.
Peter Anvin Cc: --- arch/x86/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 968dbe24a255..41a7237606a3 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -129,6 +129,7 @@ KBUILD_CFLAGS += -Wno-sign-compare KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # prevent gcc from generating any FP code by mistake KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) -- cgit v1.2.3 From dba69d1092e291e257fb5673a3ad0e4c87878ebc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sun, 1 Apr 2012 13:53:36 -0300 Subject: x86, kvm: Call restore_sched_clock_state() only after %gs is initialized s2ram broke due to this KVM commit: b74f05d61b73 x86: kvmclock: abstract save/restore sched_clock_state restore_sched_clock_state() methods use percpu data, therefore they must run after %gs is initialized, but before mtrr_bp_restore() (due to lockstat using sched_clock). Move it to the correct place. 
Reported-and-tested-by: Konstantin Khlebnikov Signed-off-by: Marcelo Tosatti Cc: Avi Kivity Signed-off-by: Ingo Molnar --- arch/x86/power/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 47936830968c..218cdb16163c 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -225,13 +225,13 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); + x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); } /* Needed by apm.c */ void restore_processor_state(void) { - x86_platform.restore_sched_clock_state(); __restore_processor_state(&saved_context); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 7b8e6da46b921d30ac1553cac56d8fb74f0b431d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Mar 2012 16:50:42 +0200 Subject: perf/x86/p4: Add format attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Steven reported his P4 not booting properly, the missing format attributes cause a NULL ptr deref. Cure this by adding the missing format specification. I took the format description out of the comment near p4_config_pack*() and hope that comment is still relatively accurate. Reported-by: Steven Rostedt Reported-by: Bruno Prémont Tested-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: Jiri Olsa Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1332859842.16159.227.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ef484d9d0a25..a2dfacfd7103 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1271,6 +1271,17 @@ done: return num ? 
-EINVAL : 0; } +PMU_FORMAT_ATTR(cccr, "config:0-31" ); +PMU_FORMAT_ATTR(escr, "config:32-62"); +PMU_FORMAT_ATTR(ht, "config:63" ); + +static struct attribute *intel_p4_formats_attr[] = { + &format_attr_cccr.attr, + &format_attr_escr.attr, + &format_attr_ht.attr, + NULL, +}; + static __initconst const struct x86_pmu p4_pmu = { .name = "Netburst P4/Xeon", .handle_irq = p4_pmu_handle_irq, @@ -1305,6 +1316,8 @@ static __initconst const struct x86_pmu p4_pmu = { * the former idea is taken from OProfile code */ .perfctr_second_write = 1, + + .format_attrs = intel_p4_formats_attr, }; __init int p4_pmu_init(void) -- cgit v1.2.3 From a998d4342337c82dacdc0897d30a9364de1576a1 Mon Sep 17 00:00:00 2001 From: Jan Seiffert Date: Fri, 30 Mar 2012 05:24:05 +0000 Subject: bpf jit: Let the x86 jit handle negative offsets Now the helper function from filter.c for negative offsets is exported, it can be used in the jit to handle negative offsets. First modify the asm load helper functions to handle: - known positive offsets - known negative offsets - any offset then the compiler can be modified to explicitly use these helpers when appropriate. This fixes the case of a negative X register and allows to lift the restriction that bpf programs with negative offsets can't be jited. Signed-of-by: Jan Seiffert Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- arch/x86/net/bpf_jit.S | 122 +++++++++++++++++++++++++++++++++----------- arch/x86/net/bpf_jit_comp.c | 41 +++++++++------ 2 files changed, 115 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 66870223f8c5..877b9a1b2152 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -18,17 +18,17 @@ * r9d : hlen = skb->len - skb->data_len */ #define SKBDATA %r8 - -sk_load_word_ind: - .globl sk_load_word_ind - - add %ebx,%esi /* offset += X */ -# test %esi,%esi /* if (offset < 0) goto bpf_error; */ - js bpf_error +#define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */ sk_load_word: .globl sk_load_word + test %esi,%esi + js bpf_slow_path_word_neg + +sk_load_word_positive_offset: + .globl sk_load_word_positive_offset + mov %r9d,%eax # hlen sub %esi,%eax # hlen - offset cmp $3,%eax @@ -37,16 +37,15 @@ sk_load_word: bswap %eax /* ntohl() */ ret - -sk_load_half_ind: - .globl sk_load_half_ind - - add %ebx,%esi /* offset += X */ - js bpf_error - sk_load_half: .globl sk_load_half + test %esi,%esi + js bpf_slow_path_half_neg + +sk_load_half_positive_offset: + .globl sk_load_half_positive_offset + mov %r9d,%eax sub %esi,%eax # hlen - offset cmp $1,%eax @@ -55,14 +54,15 @@ sk_load_half: rol $8,%ax # ntohs() ret -sk_load_byte_ind: - .globl sk_load_byte_ind - add %ebx,%esi /* offset += X */ - js bpf_error - sk_load_byte: .globl sk_load_byte + test %esi,%esi + js bpf_slow_path_byte_neg + +sk_load_byte_positive_offset: + .globl sk_load_byte_positive_offset + cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */ jle bpf_slow_path_byte movzbl (SKBDATA,%rsi),%eax @@ -73,25 +73,21 @@ sk_load_byte: * * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf) * Must preserve A accumulator (%eax) - * Inputs : %esi is the offset value, already known positive + * Inputs : %esi is the offset value */ -ENTRY(sk_load_byte_msh) - CFI_STARTPROC +sk_load_byte_msh: + .globl sk_load_byte_msh + 
test %esi,%esi + js bpf_slow_path_byte_msh_neg + +sk_load_byte_msh_positive_offset: + .globl sk_load_byte_msh_positive_offset cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */ jle bpf_slow_path_byte_msh movzbl (SKBDATA,%rsi),%ebx and $15,%bl shl $2,%bl ret - CFI_ENDPROC -ENDPROC(sk_load_byte_msh) - -bpf_error: -# force a return 0 from jit handler - xor %eax,%eax - mov -8(%rbp),%rbx - leaveq - ret /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ @@ -138,3 +134,67 @@ bpf_slow_path_byte_msh: shl $2,%al xchg %eax,%ebx ret + +#define sk_negative_common(SIZE) \ + push %rdi; /* save skb */ \ + push %r9; \ + push SKBDATA; \ +/* rsi already has offset */ \ + mov $SIZE,%ecx; /* size */ \ + call bpf_internal_load_pointer_neg_helper; \ + test %rax,%rax; \ + pop SKBDATA; \ + pop %r9; \ + pop %rdi; \ + jz bpf_error + + +bpf_slow_path_word_neg: + cmp SKF_MAX_NEG_OFF, %esi /* test range */ + jl bpf_error /* offset lower -> error */ +sk_load_word_negative_offset: + .globl sk_load_word_negative_offset + sk_negative_common(4) + mov (%rax), %eax + bswap %eax + ret + +bpf_slow_path_half_neg: + cmp SKF_MAX_NEG_OFF, %esi + jl bpf_error +sk_load_half_negative_offset: + .globl sk_load_half_negative_offset + sk_negative_common(2) + mov (%rax),%ax + rol $8,%ax + movzwl %ax,%eax + ret + +bpf_slow_path_byte_neg: + cmp SKF_MAX_NEG_OFF, %esi + jl bpf_error +sk_load_byte_negative_offset: + .globl sk_load_byte_negative_offset + sk_negative_common(1) + movzbl (%rax), %eax + ret + +bpf_slow_path_byte_msh_neg: + cmp SKF_MAX_NEG_OFF, %esi + jl bpf_error +sk_load_byte_msh_negative_offset: + .globl sk_load_byte_msh_negative_offset + xchg %eax,%ebx /* dont lose A , X is about to be scratched */ + sk_negative_common(1) + movzbl (%rax),%eax + and $15,%al + shl $2,%al + xchg %eax,%ebx + ret + +bpf_error: +# force a return 0 from jit handler + xor %eax,%eax + mov -8(%rbp),%rbx + leaveq + ret diff --git a/arch/x86/net/bpf_jit_comp.c 
b/arch/x86/net/bpf_jit_comp.c index 5a5b6e4dd738..0597f95b6da6 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -30,7 +30,10 @@ int bpf_jit_enable __read_mostly; * assembly code in arch/x86/net/bpf_jit.S */ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[]; -extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[]; +extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[]; +extern u8 sk_load_byte_positive_offset[], sk_load_byte_msh_positive_offset[]; +extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[]; +extern u8 sk_load_byte_negative_offset[], sk_load_byte_msh_negative_offset[]; static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -117,6 +120,8 @@ static inline void bpf_flush_icache(void *start, void *end) set_fs(old_fs); } +#define CHOOSE_LOAD_FUNC(K, func) \ + ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) void bpf_jit_compile(struct sk_filter *fp) { @@ -473,44 +478,46 @@ void bpf_jit_compile(struct sk_filter *fp) #endif break; case BPF_S_LD_W_ABS: - func = sk_load_word; + func = CHOOSE_LOAD_FUNC(K, sk_load_word); common_load: seen |= SEEN_DATAREF; - if ((int)K < 0) { - /* Abort the JIT because __load_pointer() is needed. */ - goto out; - } t_offset = func - (image + addrs[i]); EMIT1_off32(0xbe, K); /* mov imm32,%esi */ EMIT1_off32(0xe8, t_offset); /* call */ break; case BPF_S_LD_H_ABS: - func = sk_load_half; + func = CHOOSE_LOAD_FUNC(K, sk_load_half); goto common_load; case BPF_S_LD_B_ABS: - func = sk_load_byte; + func = CHOOSE_LOAD_FUNC(K, sk_load_byte); goto common_load; case BPF_S_LDX_B_MSH: - if ((int)K < 0) { - /* Abort the JIT because __load_pointer() is needed. 
*/ - goto out; - } + func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh); seen |= SEEN_DATAREF | SEEN_XREG; - t_offset = sk_load_byte_msh - (image + addrs[i]); + t_offset = func - (image + addrs[i]); EMIT1_off32(0xbe, K); /* mov imm32,%esi */ EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */ break; case BPF_S_LD_W_IND: - func = sk_load_word_ind; + func = sk_load_word; common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; t_offset = func - (image + addrs[i]); - EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + if (K) { + if (is_imm8(K)) { + EMIT3(0x8d, 0x73, K); /* lea imm8(%rbx), %esi */ + } else { + EMIT2(0x8d, 0xb3); /* lea imm32(%rbx),%esi */ + EMIT(K, 4); + } + } else { + EMIT2(0x89,0xde); /* mov %ebx,%esi */ + } EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */ break; case BPF_S_LD_H_IND: - func = sk_load_half_ind; + func = sk_load_half; goto common_load_ind; case BPF_S_LD_B_IND: - func = sk_load_byte_ind; + func = sk_load_byte; goto common_load_ind; case BPF_S_JMP_JA: t_offset = addrs[i + K] - addrs[i]; -- cgit v1.2.3 From fea5295324ce7ce6ccfe0909cc6740a2d34aa5a3 Mon Sep 17 00:00:00 2001 From: Sasikantha babu Date: Wed, 21 Mar 2012 18:49:00 +0530 Subject: KVM: PMU: Fix integer constant is too large warning in kvm_pmu_set_msr() Signed-off-by: Sasikantha babu Signed-off-by: Avi Kivity --- arch/x86/kvm/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a73f0c104813..173df38dbda5 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -369,7 +369,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) case MSR_CORE_PERF_FIXED_CTR_CTRL: if (pmu->fixed_ctr_ctrl == data) return 0; - if (!(data & 0xfffffffffffff444)) { + if (!(data & 0xfffffffffffff444ull)) { reprogram_fixed_counters(pmu, data); return 0; } -- cgit v1.2.3 From 7a4f5ad051e02139a9f1c0f7f4b1acb88915852b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Mar 2012 19:47:26 -0300 Subject: 
KVM: VMX: vmx_set_cr0 expects kvm->srcu locked vmx_set_cr0 is called from vcpu run context, therefore it expects kvm->srcu to be held (for setting up the real-mode TSS). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 280751c84724..ad85adfef843 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3906,7 +3906,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); -- cgit v1.2.3 From e08759215b7dcb7111e94f0f96918dd98e86ca6b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 4 Apr 2012 15:30:33 +0300 Subject: KVM: Resolve RCU vs. async page fault problem "Page ready" async PF can kick vcpu out of idle state much like IRQ. We need to tell RCU about this. Reported-by: Sasha Levin Signed-off-by: Gleb Natapov Reviewed-by: Paul E. 
McKenney Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 694d801bf606..b8ba6e4a27e4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -38,6 +38,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> +#include <asm/idle.h> static int kvmapf = 1; @@ -253,7 +254,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) kvm_async_pf_task_wait((u32)read_cr2()); break; case KVM_PV_REASON_PAGE_READY: + rcu_irq_enter(); + exit_idle(); kvm_async_pf_task_wake((u32)read_cr2()); + rcu_irq_exit(); break; } } -- cgit v1.2.3 From 234e340582901211f40d8c732afc49f0630ecf05 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 5 Apr 2012 14:25:11 -0700 Subject: simple_open: automatically convert to simple_open() Many users of debugfs copy the implementation of default_open() when they want to support a custom read/write function op. This leads to a proliferation of the default_open() implementation across the entire tree. Now that the common implementation has been consolidated into libfs we can replace all the users of this function with simple_open(). This replacement was done with the following semantic patch: @ open @ identifier open_f != simple_open; identifier i, f; @@ -int open_f(struct inode *i, struct file *f) -{ ( -if (i->i_private) -f->private_data = i->i_private; | -f->private_data = i->i_private; ) -return 0; -} @ has_open depends on open @ identifier fops; identifier open.open_f; @@ struct file_operations fops = { ... -.open = open_f, +.open = simple_open, ...
}; [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Stephen Boyd Cc: Greg Kroah-Hartman Cc: Al Viro Cc: Julia Lawall Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-msm/smd_debug.c | 8 +---- arch/x86/kernel/kdebugfs.c | 9 +----- drivers/acpi/ec_sys.c | 8 +---- drivers/base/regmap/regmap-debugfs.c | 12 ++----- drivers/bluetooth/btmrvl_debugfs.c | 26 ++++++--------- drivers/char/virtio_console.c | 8 +---- drivers/dma/coh901318.c | 9 +----- drivers/gpu/drm/i915/i915_debugfs.c | 14 ++------- drivers/hid/hid-picolcd.c | 16 ++-------- drivers/hid/hid-wiimote-debug.c | 8 +---- drivers/idle/i7300_idle.c | 8 +---- drivers/iommu/omap-iommu-debug.c | 10 ++---- drivers/mfd/aat2870-core.c | 9 +----- drivers/mfd/ab3100-core.c | 8 +---- drivers/misc/ibmasm/ibmasmfs.c | 8 +---- drivers/mtd/ubi/debug.c | 10 +----- drivers/net/caif/caif_spi.c | 10 ++---- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 9 +----- drivers/net/wimax/i2400m/debugfs.c | 15 ++------- drivers/net/wireless/ath/ath5k/debug.c | 23 +++++--------- drivers/net/wireless/ath/ath6kl/debug.c | 42 +++++++++++-------------- drivers/net/wireless/ath/ath9k/debug.c | 37 ++++++++++------------ drivers/net/wireless/ath/ath9k/dfs_debug.c | 9 +----- drivers/net/wireless/ath/ath9k/htc_drv_debug.c | 26 ++++++--------- drivers/net/wireless/ath/ath9k/rc.c | 8 +---- drivers/net/wireless/ath/carl9170/debug.c | 7 +---- drivers/net/wireless/b43/debugfs.c | 8 +---- drivers/net/wireless/b43legacy/debugfs.c | 8 +---- drivers/net/wireless/iwlegacy/3945-rs.c | 8 +---- drivers/net/wireless/iwlegacy/4965-rs.c | 12 ++----- drivers/net/wireless/iwlegacy/debug.c | 12 ++----- drivers/net/wireless/iwlwifi/iwl-agn-rs.c | 11 ++----- drivers/net/wireless/iwlwifi/iwl-debugfs.c | 12 ++----- drivers/net/wireless/iwlwifi/iwl-trans-pcie.c | 12 ++----- drivers/net/wireless/iwmc3200wifi/debugfs.c | 14 +++------ drivers/net/wireless/iwmc3200wifi/sdio.c | 9 +----- 
drivers/net/wireless/libertas/debugfs.c | 10 ++---- drivers/net/wireless/mwifiex/debugfs.c | 18 ++--------- drivers/net/wireless/wl1251/debugfs.c | 14 +++------ drivers/net/wireless/wl12xx/debugfs.c | 38 ++++++++++------------ drivers/oprofile/oprofilefs.c | 14 ++------- drivers/remoteproc/remoteproc_debugfs.c | 13 ++------ drivers/scsi/lpfc/lpfc_debugfs.c | 9 +----- drivers/spi/spi-dw.c | 8 +---- drivers/tty/serial/mfd.c | 9 ++---- drivers/tty/serial/pch_uart.c | 8 ++--- drivers/usb/core/inode.c | 10 +----- drivers/usb/host/ehci-dbg.c | 9 +----- drivers/uwb/uwb-debug.c | 9 +----- fs/debugfs/file.c | 14 ++------- fs/dlm/debug_fs.c | 9 +----- fs/pstore/inode.c | 8 +---- kernel/trace/blktrace.c | 18 ++--------- net/mac80211/debugfs.c | 12 ++----- net/mac80211/debugfs.h | 1 - net/mac80211/debugfs_key.c | 4 +-- net/mac80211/debugfs_netdev.c | 2 +- net/mac80211/debugfs_sta.c | 4 +-- net/mac80211/rate.c | 2 +- net/wireless/debugfs.c | 10 ++---- sound/soc/imx/imx-audmux.c | 8 +---- sound/soc/soc-core.c | 8 +---- sound/soc/soc-dapm.c | 16 ++-------- 63 files changed, 176 insertions(+), 572 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/mach-msm/smd_debug.c b/arch/arm/mach-msm/smd_debug.c index 0c56a5aaf588..c56df9e932ae 100644 --- a/arch/arm/mach-msm/smd_debug.c +++ b/arch/arm/mach-msm/smd_debug.c @@ -203,15 +203,9 @@ static ssize_t debug_read(struct file *file, char __user *buf, return simple_read_from_buffer(buf, count, ppos, debug_buffer, bsize); } -static int debug_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static const struct file_operations debug_ops = { .read = debug_read, - .open = debug_open, + .open = simple_open, .llseek = default_llseek, }; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 90fcf62854bb..1d5d31ea686b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -68,16 +68,9 @@ static ssize_t setup_data_read(struct file *file, char 
__user *user_buf, return count; } -static int setup_data_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - - return 0; -} - static const struct file_operations fops_setup_data = { .read = setup_data_read, - .open = setup_data_open, + .open = simple_open, .llseek = default_llseek, }; diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c index b258cab9061c..7586544fddb4 100644 --- a/drivers/acpi/ec_sys.c +++ b/drivers/acpi/ec_sys.c @@ -27,12 +27,6 @@ MODULE_PARM_DESC(write_support, "Dangerous, reboot and removal of battery may " static struct dentry *acpi_ec_debugfs_dir; -static int acpi_ec_open_io(struct inode *i, struct file *f) -{ - f->private_data = i->i_private; - return 0; -} - static ssize_t acpi_ec_read_io(struct file *f, char __user *buf, size_t count, loff_t *off) { @@ -95,7 +89,7 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf, static const struct file_operations acpi_ec_io_ops = { .owner = THIS_MODULE, - .open = acpi_ec_open_io, + .open = simple_open, .read = acpi_ec_read_io, .write = acpi_ec_write_io, .llseek = default_llseek, diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 58517a5dac13..251eb70f83e7 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -27,12 +27,6 @@ static size_t regmap_calc_reg_len(int max_val, char *buf, size_t buf_size) return strlen(buf); } -static int regmap_open_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t regmap_name_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -57,7 +51,7 @@ static ssize_t regmap_name_read_file(struct file *file, } static const struct file_operations regmap_name_fops = { - .open = regmap_open_file, + .open = simple_open, .read = regmap_name_read_file, .llseek = default_llseek, }; @@ -174,7 +168,7 @@ static ssize_t 
regmap_map_write_file(struct file *file, #endif static const struct file_operations regmap_map_fops = { - .open = regmap_open_file, + .open = simple_open, .read = regmap_map_read_file, .write = regmap_map_write_file, .llseek = default_llseek, @@ -243,7 +237,7 @@ out: } static const struct file_operations regmap_access_fops = { - .open = regmap_open_file, + .open = simple_open, .read = regmap_access_read_file, .llseek = default_llseek, }; diff --git a/drivers/bluetooth/btmrvl_debugfs.c b/drivers/bluetooth/btmrvl_debugfs.c index 6c20bbb54b71..428dbb7574bd 100644 --- a/drivers/bluetooth/btmrvl_debugfs.c +++ b/drivers/bluetooth/btmrvl_debugfs.c @@ -45,12 +45,6 @@ struct btmrvl_debugfs_data { struct dentry *txdnldready; }; -static int btmrvl_open_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t btmrvl_hscfgcmd_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { @@ -93,7 +87,7 @@ static ssize_t btmrvl_hscfgcmd_read(struct file *file, char __user *userbuf, static const struct file_operations btmrvl_hscfgcmd_fops = { .read = btmrvl_hscfgcmd_read, .write = btmrvl_hscfgcmd_write, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -134,7 +128,7 @@ static ssize_t btmrvl_psmode_read(struct file *file, char __user *userbuf, static const struct file_operations btmrvl_psmode_fops = { .read = btmrvl_psmode_read, .write = btmrvl_psmode_write, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -180,7 +174,7 @@ static ssize_t btmrvl_pscmd_read(struct file *file, char __user *userbuf, static const struct file_operations btmrvl_pscmd_fops = { .read = btmrvl_pscmd_read, .write = btmrvl_pscmd_write, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -221,7 +215,7 @@ static ssize_t btmrvl_gpiogap_read(struct file *file, char __user *userbuf, static const struct file_operations 
btmrvl_gpiogap_fops = { .read = btmrvl_gpiogap_read, .write = btmrvl_gpiogap_write, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -265,7 +259,7 @@ static ssize_t btmrvl_hscmd_read(struct file *file, char __user *userbuf, static const struct file_operations btmrvl_hscmd_fops = { .read = btmrvl_hscmd_read, .write = btmrvl_hscmd_write, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -305,7 +299,7 @@ static ssize_t btmrvl_hsmode_read(struct file *file, char __user * userbuf, static const struct file_operations btmrvl_hsmode_fops = { .read = btmrvl_hsmode_read, .write = btmrvl_hsmode_write, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -323,7 +317,7 @@ static ssize_t btmrvl_curpsmode_read(struct file *file, char __user *userbuf, static const struct file_operations btmrvl_curpsmode_fops = { .read = btmrvl_curpsmode_read, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -341,7 +335,7 @@ static ssize_t btmrvl_psstate_read(struct file *file, char __user * userbuf, static const struct file_operations btmrvl_psstate_fops = { .read = btmrvl_psstate_read, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -359,7 +353,7 @@ static ssize_t btmrvl_hsstate_read(struct file *file, char __user *userbuf, static const struct file_operations btmrvl_hsstate_fops = { .read = btmrvl_hsstate_read, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -378,7 +372,7 @@ static ssize_t btmrvl_txdnldready_read(struct file *file, char __user *userbuf, static const struct file_operations btmrvl_txdnldready_fops = { .read = btmrvl_txdnldready_read, - .open = btmrvl_open_generic, + .open = simple_open, .llseek = default_llseek, }; diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index b58b56187065..ddf86b6500b7 100644 --- a/drivers/char/virtio_console.c +++ 
b/drivers/char/virtio_console.c @@ -1038,12 +1038,6 @@ static struct attribute_group port_attribute_group = { .attrs = port_sysfs_entries, }; -static int debugfs_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - return 0; -} - static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count, loff_t *offp) { @@ -1087,7 +1081,7 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, static const struct file_operations port_debugfs_ops = { .owner = THIS_MODULE, - .open = debugfs_open, + .open = simple_open, .read = debugfs_read, }; diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c index d65a718c0f9b..a63badcd2d6e 100644 --- a/drivers/dma/coh901318.c +++ b/drivers/dma/coh901318.c @@ -104,13 +104,6 @@ static void coh901318_list_print(struct coh901318_chan *cohc, static struct coh901318_base *debugfs_dma_base; static struct dentry *dma_dentry; -static int coh901318_debugfs_open(struct inode *inode, struct file *file) -{ - - file->private_data = inode->i_private; - return 0; -} - static int coh901318_debugfs_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos) { @@ -158,7 +151,7 @@ static int coh901318_debugfs_read(struct file *file, char __user *buf, static const struct file_operations coh901318_debugfs_status_operations = { .owner = THIS_MODULE, - .open = coh901318_debugfs_open, + .open = simple_open, .read = coh901318_debugfs_read, .llseek = default_llseek, }; diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index fdb7ccefffbd..b505b70dba05 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1502,14 +1502,6 @@ static int i915_ppgtt_info(struct seq_file *m, void *data) return 0; } -static int -i915_debugfs_common_open(struct inode *inode, - struct file *filp) -{ - filp->private_data = inode->i_private; - return 0; -} - static ssize_t i915_wedged_read(struct file *filp, char __user *ubuf, @@ 
-1560,7 +1552,7 @@ i915_wedged_write(struct file *filp, static const struct file_operations i915_wedged_fops = { .owner = THIS_MODULE, - .open = i915_debugfs_common_open, + .open = simple_open, .read = i915_wedged_read, .write = i915_wedged_write, .llseek = default_llseek, @@ -1622,7 +1614,7 @@ i915_max_freq_write(struct file *filp, static const struct file_operations i915_max_freq_fops = { .owner = THIS_MODULE, - .open = i915_debugfs_common_open, + .open = simple_open, .read = i915_max_freq_read, .write = i915_max_freq_write, .llseek = default_llseek, @@ -1693,7 +1685,7 @@ i915_cache_sharing_write(struct file *filp, static const struct file_operations i915_cache_sharing_fops = { .owner = THIS_MODULE, - .open = i915_debugfs_common_open, + .open = simple_open, .read = i915_cache_sharing_read, .write = i915_cache_sharing_write, .llseek = default_llseek, diff --git a/drivers/hid/hid-picolcd.c b/drivers/hid/hid-picolcd.c index 12f9777c385d..45c3433f7986 100644 --- a/drivers/hid/hid-picolcd.c +++ b/drivers/hid/hid-picolcd.c @@ -1525,12 +1525,6 @@ static const struct file_operations picolcd_debug_reset_fops = { /* * The "eeprom" file */ -static int picolcd_debug_eeprom_open(struct inode *i, struct file *f) -{ - f->private_data = i->i_private; - return 0; -} - static ssize_t picolcd_debug_eeprom_read(struct file *f, char __user *u, size_t s, loff_t *off) { @@ -1618,7 +1612,7 @@ static ssize_t picolcd_debug_eeprom_write(struct file *f, const char __user *u, */ static const struct file_operations picolcd_debug_eeprom_fops = { .owner = THIS_MODULE, - .open = picolcd_debug_eeprom_open, + .open = simple_open, .read = picolcd_debug_eeprom_read, .write = picolcd_debug_eeprom_write, .llseek = generic_file_llseek, @@ -1627,12 +1621,6 @@ static const struct file_operations picolcd_debug_eeprom_fops = { /* * The "flash" file */ -static int picolcd_debug_flash_open(struct inode *i, struct file *f) -{ - f->private_data = i->i_private; - return 0; -} - /* record a flash address to buf 
(bounds check to be done by caller) */ static int _picolcd_flash_setaddr(struct picolcd_data *data, u8 *buf, long off) { @@ -1817,7 +1805,7 @@ static ssize_t picolcd_debug_flash_write(struct file *f, const char __user *u, */ static const struct file_operations picolcd_debug_flash_fops = { .owner = THIS_MODULE, - .open = picolcd_debug_flash_open, + .open = simple_open, .read = picolcd_debug_flash_read, .write = picolcd_debug_flash_write, .llseek = generic_file_llseek, diff --git a/drivers/hid/hid-wiimote-debug.c b/drivers/hid/hid-wiimote-debug.c index 17dabc1f339e..eec329197c16 100644 --- a/drivers/hid/hid-wiimote-debug.c +++ b/drivers/hid/hid-wiimote-debug.c @@ -23,12 +23,6 @@ struct wiimote_debug { struct dentry *drm; }; -static int wiidebug_eeprom_open(struct inode *i, struct file *f) -{ - f->private_data = i->i_private; - return 0; -} - static ssize_t wiidebug_eeprom_read(struct file *f, char __user *u, size_t s, loff_t *off) { @@ -83,7 +77,7 @@ static ssize_t wiidebug_eeprom_read(struct file *f, char __user *u, size_t s, static const struct file_operations wiidebug_eeprom_fops = { .owner = THIS_MODULE, - .open = wiidebug_eeprom_open, + .open = simple_open, .read = wiidebug_eeprom_read, .llseek = generic_file_llseek, }; diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c index c976285d313e..fa080ebd568f 100644 --- a/drivers/idle/i7300_idle.c +++ b/drivers/idle/i7300_idle.c @@ -516,12 +516,6 @@ static struct notifier_block i7300_idle_nb = { MODULE_DEVICE_TABLE(pci, pci_tbl); -int stats_open_generic(struct inode *inode, struct file *fp) -{ - fp->private_data = inode->i_private; - return 0; -} - static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count, loff_t *off) { @@ -534,7 +528,7 @@ static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count, } static const struct file_operations idle_fops = { - .open = stats_open_generic, + .open = simple_open, .read = stats_read_ul, .llseek = default_llseek, }; diff --git 
a/drivers/iommu/omap-iommu-debug.c b/drivers/iommu/omap-iommu-debug.c index 103dbd92e256..f55fc5dfbadc 100644 --- a/drivers/iommu/omap-iommu-debug.c +++ b/drivers/iommu/omap-iommu-debug.c @@ -323,15 +323,9 @@ err_out: return count; } -static int debug_open_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - #define DEBUG_FOPS(name) \ static const struct file_operations debug_##name##_fops = { \ - .open = debug_open_generic, \ + .open = simple_open, \ .read = debug_read_##name, \ .write = debug_write_##name, \ .llseek = generic_file_llseek, \ @@ -339,7 +333,7 @@ static int debug_open_generic(struct inode *inode, struct file *file) #define DEBUG_FOPS_RO(name) \ static const struct file_operations debug_##name##_fops = { \ - .open = debug_open_generic, \ + .open = simple_open, \ .read = debug_read_##name, \ .llseek = generic_file_llseek, \ }; diff --git a/drivers/mfd/aat2870-core.c b/drivers/mfd/aat2870-core.c index 3aa36eb5c79b..44a3fdbadef4 100644 --- a/drivers/mfd/aat2870-core.c +++ b/drivers/mfd/aat2870-core.c @@ -262,13 +262,6 @@ static ssize_t aat2870_dump_reg(struct aat2870_data *aat2870, char *buf) return count; } -static int aat2870_reg_open_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - - return 0; -} - static ssize_t aat2870_reg_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -330,7 +323,7 @@ static ssize_t aat2870_reg_write_file(struct file *file, } static const struct file_operations aat2870_reg_fops = { - .open = aat2870_reg_open_file, + .open = simple_open, .read = aat2870_reg_read_file, .write = aat2870_reg_write_file, }; diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index 60107ee166fc..1efad20fb175 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -483,12 +483,6 @@ struct ab3100_get_set_reg_priv { bool mode; }; -static int ab3100_get_set_reg_open_file(struct inode *inode, 
struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t ab3100_get_set_reg(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) @@ -583,7 +577,7 @@ static ssize_t ab3100_get_set_reg(struct file *file, } static const struct file_operations ab3100_get_set_reg_fops = { - .open = ab3100_get_set_reg_open_file, + .open = simple_open, .write = ab3100_get_set_reg, .llseek = noop_llseek, }; diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 1c034b80d408..6673e578b3e9 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -500,12 +500,6 @@ static ssize_t r_heartbeat_file_write(struct file *file, const char __user *buf, return 1; } -static int remote_settings_file_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static int remote_settings_file_close(struct inode *inode, struct file *file) { return 0; @@ -600,7 +594,7 @@ static const struct file_operations r_heartbeat_fops = { }; static const struct file_operations remote_settings_fops = { - .open = remote_settings_file_open, + .open = simple_open, .release = remote_settings_file_close, .read = remote_settings_file_read, .write = remote_settings_file_write, diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index e2cdebf40840..61af9bb560ab 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -386,19 +386,11 @@ out: return count; } -static int default_open(struct inode *inode, struct file *file) -{ - if (inode->i_private) - file->private_data = inode->i_private; - - return 0; -} - /* File operations for all UBI debugfs files */ static const struct file_operations dfs_fops = { .read = dfs_file_read, .write = dfs_file_write, - .open = default_open, + .open = simple_open, .llseek = no_llseek, .owner = THIS_MODULE, }; diff --git a/drivers/net/caif/caif_spi.c b/drivers/net/caif/caif_spi.c index 96391c36fa74..b71ce9bf0afb 100644 
--- a/drivers/net/caif/caif_spi.c +++ b/drivers/net/caif/caif_spi.c @@ -127,12 +127,6 @@ static inline void dev_debugfs_rem(struct cfspi *cfspi) debugfs_remove(cfspi->dbgfs_dir); } -static int dbgfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t dbgfs_state(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -243,13 +237,13 @@ static ssize_t dbgfs_frame(struct file *file, char __user *user_buf, } static const struct file_operations dbgfs_state_fops = { - .open = dbgfs_open, + .open = simple_open, .read = dbgfs_state, .owner = THIS_MODULE }; static const struct file_operations dbgfs_frame_fops = { - .open = dbgfs_open, + .open = simple_open, .read = dbgfs_frame, .owner = THIS_MODULE }; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 05ff076af06d..b126b98065a9 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2000,13 +2000,6 @@ static const struct ethtool_ops cxgb_ethtool_ops = { /* * debugfs support */ - -static int mem_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t mem_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -2050,7 +2043,7 @@ static ssize_t mem_read(struct file *file, char __user *buf, size_t count, static const struct file_operations mem_debugfs_fops = { .owner = THIS_MODULE, - .open = mem_open, + .open = simple_open, .read = mem_read, .llseek = default_llseek, }; diff --git a/drivers/net/wimax/i2400m/debugfs.c b/drivers/net/wimax/i2400m/debugfs.c index 129ba36bd04d..4b66ab1d0e5c 100644 --- a/drivers/net/wimax/i2400m/debugfs.c +++ b/drivers/net/wimax/i2400m/debugfs.c @@ -53,17 +53,6 @@ struct dentry *debugfs_create_netdev_queue_stopped( &fops_netdev_queue_stopped); } - -/* - * inode->i_private has the @data argument to 
debugfs_create_file() - */ -static -int i2400m_stats_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - return 0; -} - /* * We don't allow partial reads of this file, as then the reader would * get weirdly confused data as it is updated. @@ -117,7 +106,7 @@ ssize_t i2400m_rx_stats_write(struct file *filp, const char __user *buffer, static const struct file_operations i2400m_rx_stats_fops = { .owner = THIS_MODULE, - .open = i2400m_stats_open, + .open = simple_open, .read = i2400m_rx_stats_read, .write = i2400m_rx_stats_write, .llseek = default_llseek, @@ -170,7 +159,7 @@ ssize_t i2400m_tx_stats_write(struct file *filp, const char __user *buffer, static const struct file_operations i2400m_tx_stats_fops = { .owner = THIS_MODULE, - .open = i2400m_stats_open, + .open = simple_open, .read = i2400m_tx_stats_read, .write = i2400m_tx_stats_write, .llseek = default_llseek, diff --git a/drivers/net/wireless/ath/ath5k/debug.c b/drivers/net/wireless/ath/ath5k/debug.c index 8c5ce8b0c734..e5e8f45d86ac 100644 --- a/drivers/net/wireless/ath/ath5k/debug.c +++ b/drivers/net/wireless/ath/ath5k/debug.c @@ -71,13 +71,6 @@ static unsigned int ath5k_debug; module_param_named(debug, ath5k_debug, uint, 0); -static int ath5k_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - - /* debugfs: registers */ struct reg { @@ -265,7 +258,7 @@ static ssize_t write_file_beacon(struct file *file, static const struct file_operations fops_beacon = { .read = read_file_beacon, .write = write_file_beacon, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -285,7 +278,7 @@ static ssize_t write_file_reset(struct file *file, static const struct file_operations fops_reset = { .write = write_file_reset, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = noop_llseek, }; @@ -365,7 +358,7 @@ static ssize_t 
write_file_debug(struct file *file, static const struct file_operations fops_debug = { .read = read_file_debug, .write = write_file_debug, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -477,7 +470,7 @@ static ssize_t write_file_antenna(struct file *file, static const struct file_operations fops_antenna = { .read = read_file_antenna, .write = write_file_antenna, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -532,7 +525,7 @@ static ssize_t read_file_misc(struct file *file, char __user *user_buf, static const struct file_operations fops_misc = { .read = read_file_misc, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, }; @@ -647,7 +640,7 @@ static ssize_t write_file_frameerrors(struct file *file, static const struct file_operations fops_frameerrors = { .read = read_file_frameerrors, .write = write_file_frameerrors, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -810,7 +803,7 @@ static ssize_t write_file_ani(struct file *file, static const struct file_operations fops_ani = { .read = read_file_ani, .write = write_file_ani, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -881,7 +874,7 @@ static ssize_t write_file_queue(struct file *file, static const struct file_operations fops_queue = { .read = read_file_queue, .write = write_file_queue, - .open = ath5k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/ath/ath6kl/debug.c b/drivers/net/wireless/ath/ath6kl/debug.c index 552adb3f80d0..d01403a263ff 100644 --- a/drivers/net/wireless/ath/ath6kl/debug.c +++ b/drivers/net/wireless/ath/ath6kl/debug.c @@ -217,12 +217,6 @@ void dump_cred_dist_stats(struct htc_target *target) target->credit_info->cur_free_credits); } -static int 
ath6kl_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - void ath6kl_debug_war(struct ath6kl *ar, enum ath6kl_war war) { switch (war) { @@ -263,7 +257,7 @@ static ssize_t read_file_war_stats(struct file *file, char __user *user_buf, static const struct file_operations fops_war_stats = { .read = read_file_war_stats, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -488,7 +482,7 @@ static ssize_t ath6kl_fwlog_mask_write(struct file *file, } static const struct file_operations fops_fwlog_mask = { - .open = ath6kl_debugfs_open, + .open = simple_open, .read = ath6kl_fwlog_mask_read, .write = ath6kl_fwlog_mask_write, .owner = THIS_MODULE, @@ -634,7 +628,7 @@ static ssize_t read_file_tgt_stats(struct file *file, char __user *user_buf, static const struct file_operations fops_tgt_stats = { .read = read_file_tgt_stats, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -699,7 +693,7 @@ static ssize_t read_file_credit_dist_stats(struct file *file, static const struct file_operations fops_credit_dist_stats = { .read = read_file_credit_dist_stats, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -802,7 +796,7 @@ static ssize_t ath6kl_endpoint_stats_write(struct file *file, } static const struct file_operations fops_endpoint_stats = { - .open = ath6kl_debugfs_open, + .open = simple_open, .read = ath6kl_endpoint_stats_read, .write = ath6kl_endpoint_stats_write, .owner = THIS_MODULE, @@ -875,7 +869,7 @@ static ssize_t ath6kl_regread_write(struct file *file, static const struct file_operations fops_diag_reg_read = { .read = ath6kl_regread_read, .write = ath6kl_regread_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -999,7 +993,7 @@ static ssize_t ath6kl_lrssi_roam_read(struct file 
*file, static const struct file_operations fops_lrssi_roam_threshold = { .read = ath6kl_lrssi_roam_read, .write = ath6kl_lrssi_roam_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1061,7 +1055,7 @@ static ssize_t ath6kl_regwrite_write(struct file *file, static const struct file_operations fops_diag_reg_write = { .read = ath6kl_regwrite_read, .write = ath6kl_regwrite_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1166,7 +1160,7 @@ static ssize_t ath6kl_roam_table_read(struct file *file, char __user *user_buf, static const struct file_operations fops_roam_table = { .read = ath6kl_roam_table_read, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1204,7 +1198,7 @@ static ssize_t ath6kl_force_roam_write(struct file *file, static const struct file_operations fops_force_roam = { .write = ath6kl_force_roam_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1244,7 +1238,7 @@ static ssize_t ath6kl_roam_mode_write(struct file *file, static const struct file_operations fops_roam_mode = { .write = ath6kl_roam_mode_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1286,7 +1280,7 @@ static ssize_t ath6kl_keepalive_write(struct file *file, } static const struct file_operations fops_keepalive = { - .open = ath6kl_debugfs_open, + .open = simple_open, .read = ath6kl_keepalive_read, .write = ath6kl_keepalive_write, .owner = THIS_MODULE, @@ -1331,7 +1325,7 @@ static ssize_t ath6kl_disconnect_timeout_write(struct file *file, } static const struct file_operations fops_disconnect_timeout = { - .open = ath6kl_debugfs_open, + .open = simple_open, .read = ath6kl_disconnect_timeout_read, .write = ath6kl_disconnect_timeout_write, .owner = THIS_MODULE, @@ -1512,7 +1506,7 
@@ static ssize_t ath6kl_create_qos_write(struct file *file, static const struct file_operations fops_create_qos = { .write = ath6kl_create_qos_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1560,7 +1554,7 @@ static ssize_t ath6kl_delete_qos_write(struct file *file, static const struct file_operations fops_delete_qos = { .write = ath6kl_delete_qos_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1593,7 +1587,7 @@ static ssize_t ath6kl_bgscan_int_write(struct file *file, static const struct file_operations fops_bgscan_int = { .write = ath6kl_bgscan_int_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1651,7 +1645,7 @@ static ssize_t ath6kl_listen_int_read(struct file *file, static const struct file_operations fops_listen_int = { .read = ath6kl_listen_int_read, .write = ath6kl_listen_int_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1711,7 +1705,7 @@ static ssize_t ath6kl_power_params_write(struct file *file, static const struct file_operations fops_power_params = { .write = ath6kl_power_params_write, - .open = ath6kl_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/ath/ath9k/debug.c b/drivers/net/wireless/ath/ath9k/debug.c index 35d1c8e91d1c..ff47b32ecaf4 100644 --- a/drivers/net/wireless/ath/ath9k/debug.c +++ b/drivers/net/wireless/ath/ath9k/debug.c @@ -26,11 +26,6 @@ #define REG_READ_D(_ah, _reg) \ ath9k_hw_common(_ah)->ops->read((_ah), (_reg)) -static int ath9k_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} static ssize_t ath9k_debugfs_read_buf(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -83,7 +78,7 @@ static ssize_t 
write_file_debug(struct file *file, const char __user *user_buf, static const struct file_operations fops_debug = { .read = read_file_debug, .write = write_file_debug, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -129,7 +124,7 @@ static ssize_t write_file_tx_chainmask(struct file *file, const char __user *use static const struct file_operations fops_tx_chainmask = { .read = read_file_tx_chainmask, .write = write_file_tx_chainmask, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -172,7 +167,7 @@ static ssize_t write_file_rx_chainmask(struct file *file, const char __user *use static const struct file_operations fops_rx_chainmask = { .read = read_file_rx_chainmask, .write = write_file_rx_chainmask, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -223,7 +218,7 @@ static ssize_t write_file_disable_ani(struct file *file, static const struct file_operations fops_disable_ani = { .read = read_file_disable_ani, .write = write_file_disable_ani, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -324,7 +319,7 @@ static ssize_t read_file_dma(struct file *file, char __user *user_buf, static const struct file_operations fops_dma = { .read = read_file_dma, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -446,7 +441,7 @@ static ssize_t read_file_interrupt(struct file *file, char __user *user_buf, static const struct file_operations fops_interrupt = { .read = read_file_interrupt, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -852,28 +847,28 @@ void ath_debug_stat_tx(struct ath_softc *sc, struct ath_buf *bf, static const struct file_operations fops_xmit = { .read = read_file_xmit, - .open = ath9k_debugfs_open, + .open = simple_open, 
.owner = THIS_MODULE, .llseek = default_llseek, }; static const struct file_operations fops_stations = { .read = read_file_stations, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static const struct file_operations fops_misc = { .read = read_file_misc, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static const struct file_operations fops_reset = { .read = read_file_reset, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1016,7 +1011,7 @@ void ath_debug_stat_rx(struct ath_softc *sc, struct ath_rx_status *rs) static const struct file_operations fops_recv = { .read = read_file_recv, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1055,7 +1050,7 @@ static ssize_t write_file_regidx(struct file *file, const char __user *user_buf, static const struct file_operations fops_regidx = { .read = read_file_regidx, .write = write_file_regidx, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1102,7 +1097,7 @@ static ssize_t write_file_regval(struct file *file, const char __user *user_buf, static const struct file_operations fops_regval = { .read = read_file_regval, .write = write_file_regval, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1191,7 +1186,7 @@ static ssize_t read_file_dump_nfcal(struct file *file, char __user *user_buf, static const struct file_operations fops_dump_nfcal = { .read = read_file_dump_nfcal, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1219,7 +1214,7 @@ static ssize_t read_file_base_eeprom(struct file *file, char __user *user_buf, static const struct file_operations fops_base_eeprom = { .read = read_file_base_eeprom, - .open = 
ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -1247,7 +1242,7 @@ static ssize_t read_file_modal_eeprom(struct file *file, char __user *user_buf, static const struct file_operations fops_modal_eeprom = { .read = read_file_modal_eeprom, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/ath/ath9k/dfs_debug.c b/drivers/net/wireless/ath/ath9k/dfs_debug.c index 106d031d834a..4364c103ed33 100644 --- a/drivers/net/wireless/ath/ath9k/dfs_debug.c +++ b/drivers/net/wireless/ath/ath9k/dfs_debug.c @@ -60,16 +60,9 @@ static ssize_t read_file_dfs(struct file *file, char __user *user_buf, return retval; } -static int ath9k_dfs_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - - return 0; -} - static const struct file_operations fops_dfs_stats = { .read = read_file_dfs, - .open = ath9k_dfs_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c index d3ff33c71aa5..3035deb7a0cd 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c @@ -16,12 +16,6 @@ #include "htc.h" -static int ath9k_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t read_file_tgt_int_stats(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -75,7 +69,7 @@ static ssize_t read_file_tgt_int_stats(struct file *file, char __user *user_buf, static const struct file_operations fops_tgt_int_stats = { .read = read_file_tgt_int_stats, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -145,7 +139,7 @@ static ssize_t read_file_tgt_tx_stats(struct file *file, char __user *user_buf, static 
const struct file_operations fops_tgt_tx_stats = { .read = read_file_tgt_tx_stats, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -191,7 +185,7 @@ static ssize_t read_file_tgt_rx_stats(struct file *file, char __user *user_buf, static const struct file_operations fops_tgt_rx_stats = { .read = read_file_tgt_rx_stats, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -243,7 +237,7 @@ static ssize_t read_file_xmit(struct file *file, char __user *user_buf, static const struct file_operations fops_xmit = { .read = read_file_xmit, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -364,7 +358,7 @@ static ssize_t read_file_recv(struct file *file, char __user *user_buf, static const struct file_operations fops_recv = { .read = read_file_recv, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -399,7 +393,7 @@ static ssize_t read_file_slot(struct file *file, char __user *user_buf, static const struct file_operations fops_slot = { .read = read_file_slot, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -446,7 +440,7 @@ static ssize_t read_file_queue(struct file *file, char __user *user_buf, static const struct file_operations fops_queue = { .read = read_file_queue, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -487,7 +481,7 @@ static ssize_t write_file_debug(struct file *file, const char __user *user_buf, static const struct file_operations fops_debug = { .read = read_file_debug, .write = write_file_debug, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -636,7 +630,7 @@ static ssize_t read_file_base_eeprom(struct file *file, char __user *user_buf, static const struct 
file_operations fops_base_eeprom = { .read = read_file_base_eeprom, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -917,7 +911,7 @@ static ssize_t read_file_modal_eeprom(struct file *file, char __user *user_buf, static const struct file_operations fops_modal_eeprom = { .read = read_file_modal_eeprom, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/ath/ath9k/rc.c b/drivers/net/wireless/ath/ath9k/rc.c index 4f848493fece..08bb45532701 100644 --- a/drivers/net/wireless/ath/ath9k/rc.c +++ b/drivers/net/wireless/ath/ath9k/rc.c @@ -1480,12 +1480,6 @@ static void ath_rate_update(void *priv, struct ieee80211_supported_band *sband, #ifdef CONFIG_ATH9K_DEBUGFS -static int ath9k_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t read_file_rcstat(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -1553,7 +1547,7 @@ static ssize_t read_file_rcstat(struct file *file, char __user *user_buf, static const struct file_operations fops_rcstat = { .read = read_file_rcstat, - .open = ath9k_debugfs_open, + .open = simple_open, .owner = THIS_MODULE }; diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c index 3c164226687f..93fe6003a493 100644 --- a/drivers/net/wireless/ath/carl9170/debug.c +++ b/drivers/net/wireless/ath/carl9170/debug.c @@ -48,11 +48,6 @@ #define ADD(buf, off, max, fmt, args...) 
\ off += snprintf(&buf[off], max - off, fmt, ##args); -static int carl9170_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} struct carl9170_debugfs_fops { unsigned int read_bufsize; @@ -178,7 +173,7 @@ static const struct carl9170_debugfs_fops carl_debugfs_##name ##_ops = {\ .attr = _attr, \ .req_dev_state = _dstate, \ .fops = { \ - .open = carl9170_debugfs_open, \ + .open = simple_open, \ .read = carl9170_debugfs_read, \ .write = carl9170_debugfs_write, \ .owner = THIS_MODULE \ diff --git a/drivers/net/wireless/b43/debugfs.c b/drivers/net/wireless/b43/debugfs.c index e751fdee89b2..e807bd930647 100644 --- a/drivers/net/wireless/b43/debugfs.c +++ b/drivers/net/wireless/b43/debugfs.c @@ -500,12 +500,6 @@ out: #undef fappend -static int b43_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t b43_debugfs_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -624,7 +618,7 @@ out_unlock: .read = _read, \ .write = _write, \ .fops = { \ - .open = b43_debugfs_open, \ + .open = simple_open, \ .read = b43_debugfs_read, \ .write = b43_debugfs_write, \ .llseek = generic_file_llseek, \ diff --git a/drivers/net/wireless/b43legacy/debugfs.c b/drivers/net/wireless/b43legacy/debugfs.c index 5e28ad0d6d17..1965edb765a2 100644 --- a/drivers/net/wireless/b43legacy/debugfs.c +++ b/drivers/net/wireless/b43legacy/debugfs.c @@ -197,12 +197,6 @@ static int restart_write_file(struct b43legacy_wldev *dev, const char *buf, size #undef fappend -static int b43legacy_debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t b43legacy_debugfs_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -331,7 +325,7 @@ out_unlock: .read = _read, \ .write = _write, \ .fops = { \ - .open = b43legacy_debugfs_open, \ + .open = simple_open, \ .read = 
b43legacy_debugfs_read, \ .write = b43legacy_debugfs_write, \ .llseek = generic_file_llseek, \ diff --git a/drivers/net/wireless/iwlegacy/3945-rs.c b/drivers/net/wireless/iwlegacy/3945-rs.c index 70bee1a4d876..4b10157d8686 100644 --- a/drivers/net/wireless/iwlegacy/3945-rs.c +++ b/drivers/net/wireless/iwlegacy/3945-rs.c @@ -821,12 +821,6 @@ out: } #ifdef CONFIG_MAC80211_DEBUGFS -static int -il3945_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} static ssize_t il3945_sta_dbgfs_stats_table_read(struct file *file, char __user *user_buf, @@ -862,7 +856,7 @@ il3945_sta_dbgfs_stats_table_read(struct file *file, char __user *user_buf, static const struct file_operations rs_sta_dbgfs_stats_table_ops = { .read = il3945_sta_dbgfs_stats_table_read, - .open = il3945_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/iwlegacy/4965-rs.c b/drivers/net/wireless/iwlegacy/4965-rs.c index d7e2856e41d3..11ab1247fae1 100644 --- a/drivers/net/wireless/iwlegacy/4965-rs.c +++ b/drivers/net/wireless/iwlegacy/4965-rs.c @@ -2518,12 +2518,6 @@ il4965_rs_free_sta(void *il_r, struct ieee80211_sta *sta, void *il_sta) } #ifdef CONFIG_MAC80211_DEBUGFS -static int -il4965_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} static void il4965_rs_dbgfs_set_mcs(struct il_lq_sta *lq_sta, u32 * rate_n_flags, int idx) @@ -2695,7 +2689,7 @@ il4965_rs_sta_dbgfs_scale_table_read(struct file *file, char __user *user_buf, static const struct file_operations rs_sta_dbgfs_scale_table_ops = { .write = il4965_rs_sta_dbgfs_scale_table_write, .read = il4965_rs_sta_dbgfs_scale_table_read, - .open = il4965_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -2740,7 +2734,7 @@ il4965_rs_sta_dbgfs_stats_table_read(struct file *file, char __user *user_buf, static const struct file_operations 
rs_sta_dbgfs_stats_table_ops = { .read = il4965_rs_sta_dbgfs_stats_table_read, - .open = il4965_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -2768,7 +2762,7 @@ il4965_rs_sta_dbgfs_rate_scale_data_read(struct file *file, static const struct file_operations rs_sta_dbgfs_rate_scale_data_ops = { .read = il4965_rs_sta_dbgfs_rate_scale_data_read, - .open = il4965_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/iwlegacy/debug.c b/drivers/net/wireless/iwlegacy/debug.c index 229849150aac..eff26501d60a 100644 --- a/drivers/net/wireless/iwlegacy/debug.c +++ b/drivers/net/wireless/iwlegacy/debug.c @@ -160,18 +160,12 @@ static ssize_t il_dbgfs_##name##_write(struct file *file, \ const char __user *user_buf, \ size_t count, loff_t *ppos); -static int -il_dbgfs_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} #define DEBUGFS_READ_FILE_OPS(name) \ DEBUGFS_READ_FUNC(name); \ static const struct file_operations il_dbgfs_##name##_ops = { \ .read = il_dbgfs_##name##_read, \ - .open = il_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -179,7 +173,7 @@ static const struct file_operations il_dbgfs_##name##_ops = { \ DEBUGFS_WRITE_FUNC(name); \ static const struct file_operations il_dbgfs_##name##_ops = { \ .write = il_dbgfs_##name##_write, \ - .open = il_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -189,7 +183,7 @@ static const struct file_operations il_dbgfs_##name##_ops = { \ static const struct file_operations il_dbgfs_##name##_ops = { \ .write = il_dbgfs_##name##_write, \ .read = il_dbgfs_##name##_read, \ - .open = il_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c index 53f8c51cfcdb..7e590b349dd7 100644 --- 
a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c @@ -3083,11 +3083,6 @@ static void rs_free_sta(void *priv_r, struct ieee80211_sta *sta, } #ifdef CONFIG_MAC80211_DEBUGFS -static int open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} static void rs_dbgfs_set_mcs(struct iwl_lq_sta *lq_sta, u32 *rate_n_flags, int index) { @@ -3226,7 +3221,7 @@ static ssize_t rs_sta_dbgfs_scale_table_read(struct file *file, static const struct file_operations rs_sta_dbgfs_scale_table_ops = { .write = rs_sta_dbgfs_scale_table_write, .read = rs_sta_dbgfs_scale_table_read, - .open = open_file_generic, + .open = simple_open, .llseek = default_llseek, }; static ssize_t rs_sta_dbgfs_stats_table_read(struct file *file, @@ -3269,7 +3264,7 @@ static ssize_t rs_sta_dbgfs_stats_table_read(struct file *file, static const struct file_operations rs_sta_dbgfs_stats_table_ops = { .read = rs_sta_dbgfs_stats_table_read, - .open = open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -3295,7 +3290,7 @@ static ssize_t rs_sta_dbgfs_rate_scale_data_read(struct file *file, static const struct file_operations rs_sta_dbgfs_rate_scale_data_ops = { .read = rs_sta_dbgfs_rate_scale_data_read, - .open = open_file_generic, + .open = simple_open, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/iwlwifi/iwl-debugfs.c b/drivers/net/wireless/iwlwifi/iwl-debugfs.c index b7b1c04f2fba..2bbaebd99ad4 100644 --- a/drivers/net/wireless/iwlwifi/iwl-debugfs.c +++ b/drivers/net/wireless/iwlwifi/iwl-debugfs.c @@ -84,17 +84,11 @@ static ssize_t iwl_dbgfs_##name##_write(struct file *file, \ size_t count, loff_t *ppos); -static int iwl_dbgfs_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - #define DEBUGFS_READ_FILE_OPS(name) \ DEBUGFS_READ_FUNC(name); \ static const struct file_operations iwl_dbgfs_##name##_ops = { \ .read = 
iwl_dbgfs_##name##_read, \ - .open = iwl_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -102,7 +96,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = { \ DEBUGFS_WRITE_FUNC(name); \ static const struct file_operations iwl_dbgfs_##name##_ops = { \ .write = iwl_dbgfs_##name##_write, \ - .open = iwl_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -113,7 +107,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = { \ static const struct file_operations iwl_dbgfs_##name##_ops = { \ .write = iwl_dbgfs_##name##_write, \ .read = iwl_dbgfs_##name##_read, \ - .open = iwl_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c index b4f796c82e1e..4d7b30d3e648 100644 --- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c +++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c @@ -1898,17 +1898,11 @@ static ssize_t iwl_dbgfs_##name##_write(struct file *file, \ size_t count, loff_t *ppos); -static int iwl_dbgfs_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - #define DEBUGFS_READ_FILE_OPS(name) \ DEBUGFS_READ_FUNC(name); \ static const struct file_operations iwl_dbgfs_##name##_ops = { \ .read = iwl_dbgfs_##name##_read, \ - .open = iwl_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -1916,7 +1910,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = { \ DEBUGFS_WRITE_FUNC(name); \ static const struct file_operations iwl_dbgfs_##name##_ops = { \ .write = iwl_dbgfs_##name##_write, \ - .open = iwl_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -1926,7 +1920,7 @@ static const struct file_operations iwl_dbgfs_##name##_ops = { \ static const struct file_operations 
iwl_dbgfs_##name##_ops = { \ .write = iwl_dbgfs_##name##_write, \ .read = iwl_dbgfs_##name##_read, \ - .open = iwl_dbgfs_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; diff --git a/drivers/net/wireless/iwmc3200wifi/debugfs.c b/drivers/net/wireless/iwmc3200wifi/debugfs.c index 87eef5773a02..b6199d124bb9 100644 --- a/drivers/net/wireless/iwmc3200wifi/debugfs.c +++ b/drivers/net/wireless/iwmc3200wifi/debugfs.c @@ -99,12 +99,6 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_iwm_dbg_modules, iwm_debugfs_u32_read, iwm_debugfs_dbg_modules_write, "%llu\n"); -static int iwm_generic_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - return 0; -} - static ssize_t iwm_debugfs_txq_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) @@ -401,28 +395,28 @@ out: static const struct file_operations iwm_debugfs_txq_fops = { .owner = THIS_MODULE, - .open = iwm_generic_open, + .open = simple_open, .read = iwm_debugfs_txq_read, .llseek = default_llseek, }; static const struct file_operations iwm_debugfs_tx_credit_fops = { .owner = THIS_MODULE, - .open = iwm_generic_open, + .open = simple_open, .read = iwm_debugfs_tx_credit_read, .llseek = default_llseek, }; static const struct file_operations iwm_debugfs_rx_ticket_fops = { .owner = THIS_MODULE, - .open = iwm_generic_open, + .open = simple_open, .read = iwm_debugfs_rx_ticket_read, .llseek = default_llseek, }; static const struct file_operations iwm_debugfs_fw_err_fops = { .owner = THIS_MODULE, - .open = iwm_generic_open, + .open = simple_open, .read = iwm_debugfs_fw_err_read, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/iwmc3200wifi/sdio.c b/drivers/net/wireless/iwmc3200wifi/sdio.c index 764b40dd24ad..0042f204b07f 100644 --- a/drivers/net/wireless/iwmc3200wifi/sdio.c +++ b/drivers/net/wireless/iwmc3200wifi/sdio.c @@ -264,13 +264,6 @@ static int if_sdio_send_chunk(struct iwm_priv *iwm, u8 *buf, int count) return ret; } -/* debugfs hooks */ 
-static int iwm_debugfs_sdio_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - return 0; -} - static ssize_t iwm_debugfs_sdio_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { @@ -363,7 +356,7 @@ err: static const struct file_operations iwm_debugfs_sdio_fops = { .owner = THIS_MODULE, - .open = iwm_debugfs_sdio_open, + .open = simple_open, .read = iwm_debugfs_sdio_read, .llseek = default_llseek, }; diff --git a/drivers/net/wireless/libertas/debugfs.c b/drivers/net/wireless/libertas/debugfs.c index c192671610fc..a06cc283e23d 100644 --- a/drivers/net/wireless/libertas/debugfs.c +++ b/drivers/net/wireless/libertas/debugfs.c @@ -21,12 +21,6 @@ static char *szStates[] = { static void lbs_debug_init(struct lbs_private *priv); #endif -static int open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t write_file_dummy(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -696,7 +690,7 @@ out_unlock: #define FOPS(fread, fwrite) { \ .owner = THIS_MODULE, \ - .open = open_file_generic, \ + .open = simple_open, \ .read = (fread), \ .write = (fwrite), \ .llseek = generic_file_llseek, \ @@ -962,7 +956,7 @@ static ssize_t lbs_debugfs_write(struct file *f, const char __user *buf, static const struct file_operations lbs_debug_fops = { .owner = THIS_MODULE, - .open = open_file_generic, + .open = simple_open, .write = lbs_debugfs_write, .read = lbs_debugfs_read, .llseek = default_llseek, diff --git a/drivers/net/wireless/mwifiex/debugfs.c b/drivers/net/wireless/mwifiex/debugfs.c index d26a78b6b3c4..1a845074c52a 100644 --- a/drivers/net/wireless/mwifiex/debugfs.c +++ b/drivers/net/wireless/mwifiex/debugfs.c @@ -139,18 +139,6 @@ static struct mwifiex_debug_data items[] = { static int num_of_items = ARRAY_SIZE(items); -/* - * Generic proc file open handler. 
- * - * This function is called every time a file is accessed for read or write. - */ -static int -mwifiex_open_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - /* * Proc info file read handler. * @@ -676,19 +664,19 @@ done: static const struct file_operations mwifiex_dfs_##name##_fops = { \ .read = mwifiex_##name##_read, \ .write = mwifiex_##name##_write, \ - .open = mwifiex_open_generic, \ + .open = simple_open, \ }; #define MWIFIEX_DFS_FILE_READ_OPS(name) \ static const struct file_operations mwifiex_dfs_##name##_fops = { \ .read = mwifiex_##name##_read, \ - .open = mwifiex_open_generic, \ + .open = simple_open, \ }; #define MWIFIEX_DFS_FILE_WRITE_OPS(name) \ static const struct file_operations mwifiex_dfs_##name##_fops = { \ .write = mwifiex_##name##_write, \ - .open = mwifiex_open_generic, \ + .open = simple_open, \ }; diff --git a/drivers/net/wireless/wl1251/debugfs.c b/drivers/net/wireless/wl1251/debugfs.c index 6c274007d200..448da1f8c22f 100644 --- a/drivers/net/wireless/wl1251/debugfs.c +++ b/drivers/net/wireless/wl1251/debugfs.c @@ -47,7 +47,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \ \ static const struct file_operations name## _ops = { \ .read = name## _read, \ - .open = wl1251_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -84,7 +84,7 @@ static ssize_t sub## _ ##name## _read(struct file *file, \ \ static const struct file_operations sub## _ ##name## _ops = { \ .read = sub## _ ##name## _read, \ - .open = wl1251_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -117,12 +117,6 @@ out: mutex_unlock(&wl->mutex); } -static int wl1251_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - DEBUGFS_FWSTATS_FILE(tx, internal_desc_overflow, 20, "%u"); DEBUGFS_FWSTATS_FILE(rx, out_of_mem, 20, "%u"); @@ -235,7 +229,7 @@ static ssize_t 
tx_queue_len_read(struct file *file, char __user *userbuf, static const struct file_operations tx_queue_len_ops = { .read = tx_queue_len_read, - .open = wl1251_open_file_generic, + .open = simple_open, .llseek = generic_file_llseek, }; @@ -257,7 +251,7 @@ static ssize_t tx_queue_status_read(struct file *file, char __user *userbuf, static const struct file_operations tx_queue_status_ops = { .read = tx_queue_status_read, - .open = wl1251_open_file_generic, + .open = simple_open, .llseek = generic_file_llseek, }; diff --git a/drivers/net/wireless/wl12xx/debugfs.c b/drivers/net/wireless/wl12xx/debugfs.c index e1cf72765965..564d49575c94 100644 --- a/drivers/net/wireless/wl12xx/debugfs.c +++ b/drivers/net/wireless/wl12xx/debugfs.c @@ -63,7 +63,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \ \ static const struct file_operations name## _ops = { \ .read = name## _read, \ - .open = wl1271_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -96,7 +96,7 @@ static ssize_t sub## _ ##name## _read(struct file *file, \ \ static const struct file_operations sub## _ ##name## _ops = { \ .read = sub## _ ##name## _read, \ - .open = wl1271_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -126,12 +126,6 @@ out: mutex_unlock(&wl->mutex); } -static int wl1271_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - DEBUGFS_FWSTATS_FILE(tx, internal_desc_overflow, "%u"); DEBUGFS_FWSTATS_FILE(rx, out_of_mem, "%u"); @@ -243,7 +237,7 @@ static ssize_t tx_queue_len_read(struct file *file, char __user *userbuf, static const struct file_operations tx_queue_len_ops = { .read = tx_queue_len_read, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -289,7 +283,7 @@ static ssize_t gpio_power_write(struct file *file, static const struct file_operations gpio_power_ops = { .read = gpio_power_read, .write = 
gpio_power_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -308,7 +302,7 @@ static ssize_t start_recovery_write(struct file *file, static const struct file_operations start_recovery_ops = { .write = start_recovery_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -372,7 +366,7 @@ out: static const struct file_operations dynamic_ps_timeout_ops = { .read = dynamic_ps_timeout_read, .write = dynamic_ps_timeout_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -441,7 +435,7 @@ out: static const struct file_operations forced_ps_ops = { .read = forced_ps_read, .write = forced_ps_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -483,7 +477,7 @@ static ssize_t split_scan_timeout_write(struct file *file, static const struct file_operations split_scan_timeout_ops = { .read = split_scan_timeout_read, .write = split_scan_timeout_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -566,7 +560,7 @@ static ssize_t driver_state_read(struct file *file, char __user *user_buf, static const struct file_operations driver_state_ops = { .read = driver_state_read, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -675,7 +669,7 @@ static ssize_t vifs_state_read(struct file *file, char __user *user_buf, static const struct file_operations vifs_state_ops = { .read = vifs_state_read, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -733,7 +727,7 @@ static ssize_t dtim_interval_write(struct file *file, static const struct file_operations dtim_interval_ops = { .read = dtim_interval_read, .write = dtim_interval_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -791,7 +785,7 @@ static ssize_t suspend_dtim_interval_write(struct file 
*file, static const struct file_operations suspend_dtim_interval_ops = { .read = suspend_dtim_interval_read, .write = suspend_dtim_interval_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -849,7 +843,7 @@ static ssize_t beacon_interval_write(struct file *file, static const struct file_operations beacon_interval_ops = { .read = beacon_interval_read, .write = beacon_interval_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -904,7 +898,7 @@ static ssize_t rx_streaming_interval_read(struct file *file, static const struct file_operations rx_streaming_interval_ops = { .read = rx_streaming_interval_read, .write = rx_streaming_interval_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -959,7 +953,7 @@ static ssize_t rx_streaming_always_read(struct file *file, static const struct file_operations rx_streaming_always_ops = { .read = rx_streaming_always_read, .write = rx_streaming_always_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; @@ -1003,7 +997,7 @@ out: static const struct file_operations beacon_filtering_ops = { .write = beacon_filtering_write, - .open = wl1271_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index ee8fd037bb53..849357c1045c 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -117,25 +117,17 @@ static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_ } -static int default_open(struct inode *inode, struct file *filp) -{ - if (inode->i_private) - filp->private_data = inode->i_private; - return 0; -} - - static const struct file_operations ulong_fops = { .read = ulong_read_file, .write = ulong_write_file, - .open = default_open, + .open = simple_open, .llseek = default_llseek, }; static const struct file_operations 
ulong_ro_fops = { .read = ulong_read_file, - .open = default_open, + .open = simple_open, .llseek = default_llseek, }; @@ -187,7 +179,7 @@ static ssize_t atomic_read_file(struct file *file, char __user *buf, size_t coun static const struct file_operations atomic_ro_fops = { .read = atomic_read_file, - .open = default_open, + .open = simple_open, .llseek = default_llseek, }; diff --git a/drivers/remoteproc/remoteproc_debugfs.c b/drivers/remoteproc/remoteproc_debugfs.c index 70277a530133..85d31a69e117 100644 --- a/drivers/remoteproc/remoteproc_debugfs.c +++ b/drivers/remoteproc/remoteproc_debugfs.c @@ -50,16 +50,9 @@ static ssize_t rproc_trace_read(struct file *filp, char __user *userbuf, return simple_read_from_buffer(userbuf, count, ppos, trace->va, len); } -static int rproc_open_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - - return 0; -} - static const struct file_operations trace_rproc_ops = { .read = rproc_trace_read, - .open = rproc_open_generic, + .open = simple_open, .llseek = generic_file_llseek, }; @@ -94,7 +87,7 @@ static ssize_t rproc_state_read(struct file *filp, char __user *userbuf, static const struct file_operations rproc_state_ops = { .read = rproc_state_read, - .open = rproc_open_generic, + .open = simple_open, .llseek = generic_file_llseek, }; @@ -114,7 +107,7 @@ static ssize_t rproc_name_read(struct file *filp, char __user *userbuf, static const struct file_operations rproc_name_ops = { .read = rproc_name_read, - .open = rproc_open_generic, + .open = simple_open, .llseek = generic_file_llseek, }; diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 22e17be04d8a..34f7cf76bf4f 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -997,13 +997,6 @@ lpfc_debugfs_dumpDataDif_write(struct file *file, const char __user *buf, return nbytes; } -static int -lpfc_debugfs_dif_err_open(struct inode *inode, struct file *file) -{ - 
file->private_data = inode->i_private; - return 0; -} - static ssize_t lpfc_debugfs_dif_err_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) @@ -3521,7 +3514,7 @@ static const struct file_operations lpfc_debugfs_op_dumpDif = { #undef lpfc_debugfs_op_dif_err static const struct file_operations lpfc_debugfs_op_dif_err = { .owner = THIS_MODULE, - .open = lpfc_debugfs_dif_err_open, + .open = simple_open, .llseek = lpfc_debugfs_lseek, .read = lpfc_debugfs_dif_err_read, .write = lpfc_debugfs_dif_err_write, diff --git a/drivers/spi/spi-dw.c b/drivers/spi/spi-dw.c index 082458d73ce9..d1a495f64e2d 100644 --- a/drivers/spi/spi-dw.c +++ b/drivers/spi/spi-dw.c @@ -63,12 +63,6 @@ struct chip_data { }; #ifdef CONFIG_DEBUG_FS -static int spi_show_regs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - #define SPI_REGS_BUFSIZE 1024 static ssize_t spi_show_regs(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -128,7 +122,7 @@ static ssize_t spi_show_regs(struct file *file, char __user *user_buf, static const struct file_operations mrst_spi_regs_ops = { .owner = THIS_MODULE, - .open = spi_show_regs_open, + .open = simple_open, .read = spi_show_regs, .llseek = default_llseek, }; diff --git a/drivers/tty/serial/mfd.c b/drivers/tty/serial/mfd.c index a9234ba8f8d5..c4b50af46c44 100644 --- a/drivers/tty/serial/mfd.c +++ b/drivers/tty/serial/mfd.c @@ -127,11 +127,6 @@ static inline void serial_out(struct uart_hsu_port *up, int offset, int value) #define HSU_REGS_BUFSIZE 1024 -static int hsu_show_regs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} static ssize_t port_show_regs(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -231,14 +226,14 @@ static ssize_t dma_show_regs(struct file *file, char __user *user_buf, static const struct file_operations port_regs_ops = { .owner = THIS_MODULE, - .open = 
hsu_show_regs_open, + .open = simple_open, .read = port_show_regs, .llseek = default_llseek, }; static const struct file_operations dma_regs_ops = { .owner = THIS_MODULE, - .open = hsu_show_regs_open, + .open = simple_open, .read = dma_show_regs, .llseek = default_llseek, }; diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c index 332f2eb8abbc..46ec722b4406 100644 --- a/drivers/tty/serial/pch_uart.c +++ b/drivers/tty/serial/pch_uart.c @@ -304,11 +304,7 @@ static const int trigger_level_1[4] = { 1, 1, 1, 1 }; #ifdef CONFIG_DEBUG_FS #define PCH_REGS_BUFSIZE 1024 -static int pch_show_regs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} + static ssize_t port_show_regs(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -362,7 +358,7 @@ static ssize_t port_show_regs(struct file *file, char __user *user_buf, static const struct file_operations port_regs_ops = { .owner = THIS_MODULE, - .open = pch_show_regs_open, + .open = simple_open, .read = port_show_regs, .llseek = default_llseek, }; diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index cefa0c8b5b6a..d2b9af59cba9 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -428,18 +428,10 @@ static loff_t default_file_lseek (struct file *file, loff_t offset, int orig) return retval; } -static int default_open (struct inode *inode, struct file *file) -{ - if (inode->i_private) - file->private_data = inode->i_private; - - return 0; -} - static const struct file_operations default_file_operations = { .read = default_read_file, .write = default_write_file, - .open = default_open, + .open = simple_open, .llseek = default_file_lseek, }; diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c index fd9109d7eb0e..680e1a31fb87 100644 --- a/drivers/usb/host/ehci-dbg.c +++ b/drivers/usb/host/ehci-dbg.c @@ -352,7 +352,6 @@ static int debug_async_open(struct inode *, struct file *); static 
int debug_periodic_open(struct inode *, struct file *); static int debug_registers_open(struct inode *, struct file *); static int debug_async_open(struct inode *, struct file *); -static int debug_lpm_open(struct inode *, struct file *); static ssize_t debug_lpm_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos); static ssize_t debug_lpm_write(struct file *file, const char __user *buffer, @@ -385,7 +384,7 @@ static const struct file_operations debug_registers_fops = { }; static const struct file_operations debug_lpm_fops = { .owner = THIS_MODULE, - .open = debug_lpm_open, + .open = simple_open, .read = debug_lpm_read, .write = debug_lpm_write, .release = debug_lpm_close, @@ -970,12 +969,6 @@ static int debug_registers_open(struct inode *inode, struct file *file) return file->private_data ? 0 : -ENOMEM; } -static int debug_lpm_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static int debug_lpm_close(struct inode *inode, struct file *file) { return 0; diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c index 2eecec0c13c9..6ec45beb7af5 100644 --- a/drivers/uwb/uwb-debug.c +++ b/drivers/uwb/uwb-debug.c @@ -159,13 +159,6 @@ static int cmd_ie_rm(struct uwb_rc *rc, struct uwb_dbg_cmd_ie *ie_to_rm) return uwb_rc_ie_rm(rc, ie_to_rm->data[0]); } -static int command_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - - return 0; -} - static ssize_t command_write(struct file *file, const char __user *buf, size_t len, loff_t *off) { @@ -206,7 +199,7 @@ static ssize_t command_write(struct file *file, const char __user *buf, } static const struct file_operations command_fops = { - .open = command_open, + .open = simple_open, .write = command_write, .read = NULL, .llseek = no_llseek, diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 21e93605161c..5dfafdd1dbd3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -33,18 +33,10 @@ static 
ssize_t default_write_file(struct file *file, const char __user *buf, return count; } -static int default_open(struct inode *inode, struct file *file) -{ - if (inode->i_private) - file->private_data = inode->i_private; - - return 0; -} - const struct file_operations debugfs_file_operations = { .read = default_read_file, .write = default_write_file, - .open = default_open, + .open = simple_open, .llseek = noop_llseek, }; @@ -447,7 +439,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, static const struct file_operations fops_bool = { .read = read_file_bool, .write = write_file_bool, - .open = default_open, + .open = simple_open, .llseek = default_llseek, }; @@ -492,7 +484,7 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, static const struct file_operations fops_blob = { .read = read_file_blob, - .open = default_open, + .open = simple_open, .llseek = default_llseek, }; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 3dca2b39e83f..1c9b08095f98 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -609,13 +609,6 @@ static const struct file_operations format3_fops = { /* * dump lkb's on the ls_waiters list */ - -static int waiters_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t waiters_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -644,7 +637,7 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf, static const struct file_operations waiters_fops = { .owner = THIS_MODULE, - .open = waiters_open, + .open = simple_open, .read = waiters_read, .llseek = default_llseek, }; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index f37c32b94525..8ae5a03376ae 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -52,12 +52,6 @@ struct pstore_private { char data[]; }; -static int pstore_file_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - 
return 0; -} - static ssize_t pstore_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -67,7 +61,7 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf, } static const struct file_operations pstore_file_operations = { - .open = pstore_file_open, + .open = simple_open, .read = pstore_file_read, .llseek = default_llseek, }; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cdea7b56b0c9..c0bd0308741c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_trace_remove); -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { @@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, - .open = blk_dropped_open, + .open = simple_open, .read = blk_dropped_read, .llseek = default_llseek, }; -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, size_t count, loff_t *ppos) { @@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, - .open = blk_msg_open, + .open = simple_open, .write = blk_msg_write, .llseek = noop_llseek, }; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index cc5b7a6e7e0b..778e5916d7c3 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -15,12 +15,6 @@ #include "rate.h" #include "debugfs.h" -int mac80211_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - 
return 0; -} - #define DEBUGFS_FORMAT_BUFFER_SIZE 100 int mac80211_format_buffer(char __user *userbuf, size_t count, @@ -50,7 +44,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \ #define DEBUGFS_READONLY_FILE_OPS(name) \ static const struct file_operations name## _ops = { \ .read = name## _read, \ - .open = mac80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -93,7 +87,7 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf, static const struct file_operations reset_ops = { .write = reset_write, - .open = mac80211_open_file_generic, + .open = simple_open, .llseek = noop_llseek, }; @@ -254,7 +248,7 @@ static ssize_t stats_ ##name## _read(struct file *file, \ \ static const struct file_operations stats_ ##name## _ops = { \ .read = stats_ ##name## _read, \ - .open = mac80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; diff --git a/net/mac80211/debugfs.h b/net/mac80211/debugfs.h index 7c87529630f5..9be4e6d71d00 100644 --- a/net/mac80211/debugfs.h +++ b/net/mac80211/debugfs.h @@ -3,7 +3,6 @@ #ifdef CONFIG_MAC80211_DEBUGFS extern void debugfs_hw_add(struct ieee80211_local *local); -extern int mac80211_open_file_generic(struct inode *inode, struct file *file); extern int mac80211_format_buffer(char __user *userbuf, size_t count, loff_t *ppos, char *fmt, ...); #else diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 59edcd95a58d..7932767bb482 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -30,7 +30,7 @@ static ssize_t key_##name##_read(struct file *file, \ #define KEY_OPS(name) \ static const struct file_operations key_ ##name## _ops = { \ .read = key_##name##_read, \ - .open = mac80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ } @@ -45,7 +45,7 @@ static const struct file_operations key_ ##name## _ops = { \ #define KEY_CONF_OPS(name) \ static const struct 
file_operations key_ ##name## _ops = { \ .read = key_conf_##name##_read, \ - .open = mac80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index a32eeda04aa3..30f99c344847 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -135,7 +135,7 @@ static ssize_t ieee80211_if_read_##name(struct file *file, \ static const struct file_operations name##_ops = { \ .read = ieee80211_if_read_##name, \ .write = (_write), \ - .open = mac80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 6d45804d09bc..832b2da5e4cd 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -33,7 +33,7 @@ static ssize_t sta_ ##name## _read(struct file *file, \ #define STA_OPS(name) \ static const struct file_operations sta_ ##name## _ops = { \ .read = sta_##name##_read, \ - .open = mac80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ } @@ -41,7 +41,7 @@ static const struct file_operations sta_ ##name## _ops = { \ static const struct file_operations sta_ ##name## _ops = { \ .read = sta_##name##_read, \ .write = sta_##name##_write, \ - .open = mac80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index b4f7600a3e36..3313c117b322 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -145,7 +145,7 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf, static const struct file_operations rcname_ops = { .read = rcname_read, - .open = mac80211_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; #endif diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 39765bcfb472..920cabe0461b 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -13,12 +13,6 @@ 
#include "core.h" #include "debugfs.h" -static int cfg80211_open_file_generic(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - #define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...) \ static ssize_t name## _read(struct file *file, char __user *userbuf, \ size_t count, loff_t *ppos) \ @@ -33,7 +27,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \ \ static const struct file_operations name## _ops = { \ .read = name## _read, \ - .open = cfg80211_open_file_generic, \ + .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -102,7 +96,7 @@ static ssize_t ht40allow_map_read(struct file *file, static const struct file_operations ht40allow_map_ops = { .read = ht40allow_map_read, - .open = cfg80211_open_file_generic, + .open = simple_open, .llseek = default_llseek, }; diff --git a/sound/soc/imx/imx-audmux.c b/sound/soc/imx/imx-audmux.c index 601df809a26a..1765a197acb0 100644 --- a/sound/soc/imx/imx-audmux.c +++ b/sound/soc/imx/imx-audmux.c @@ -40,12 +40,6 @@ static void __iomem *audmux_base; #ifdef CONFIG_DEBUG_FS static struct dentry *audmux_debugfs_root; -static int audmux_open_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - /* There is an annoying discontinuity in the SSI numbering with regard * to the Linux number of the devices */ static const char *audmux_port_string(int port) @@ -142,7 +136,7 @@ static ssize_t audmux_read_file(struct file *file, char __user *user_buf, } static const struct file_operations audmux_debugfs_fops = { - .open = audmux_open_file, + .open = simple_open, .read = audmux_read_file, .llseek = default_llseek, }; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index a4deebc0801a..e19c24ade414 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -201,12 +201,6 @@ static ssize_t pmdown_time_set(struct device *dev, static DEVICE_ATTR(pmdown_time, 0644, pmdown_time_show, 
pmdown_time_set); #ifdef CONFIG_DEBUG_FS -static int codec_reg_open_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t codec_reg_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -264,7 +258,7 @@ static ssize_t codec_reg_write_file(struct file *file, } static const struct file_operations codec_reg_fops = { - .open = codec_reg_open_file, + .open = simple_open, .read = codec_reg_read_file, .write = codec_reg_write_file, .llseek = default_llseek, diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 6241490fff30..5cbd2d7623b8 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1544,12 +1544,6 @@ static int dapm_power_widgets(struct snd_soc_dapm_context *dapm, int event) } #ifdef CONFIG_DEBUG_FS -static int dapm_widget_power_open_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t dapm_widget_power_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -1613,17 +1607,11 @@ static ssize_t dapm_widget_power_read_file(struct file *file, } static const struct file_operations dapm_widget_power_fops = { - .open = dapm_widget_power_open_file, + .open = simple_open, .read = dapm_widget_power_read_file, .llseek = default_llseek, }; -static int dapm_bias_open_file(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static ssize_t dapm_bias_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -1654,7 +1642,7 @@ static ssize_t dapm_bias_read_file(struct file *file, char __user *user_buf, } static const struct file_operations dapm_bias_fops = { - .open = dapm_bias_open_file, + .open = simple_open, .read = dapm_bias_read_file, .llseek = default_llseek, }; -- cgit v1.2.3 From 46ed99d1b7c92920ce9e313152522847647aae4f Mon Sep 17 00:00:00 2001 From: Emil Goode Date: Sun, 1 Apr 2012 
20:48:04 +0200 Subject: x86: vsyscall: Use NULL instead 0 for a pointer argument This patch silences the following sparse warning: arch/x86/kernel/vsyscall_64.c:250:34: warning: Using plain integer as NULL pointer Signed-off-by: Emil Goode Acked-by: Andy Lutomirski Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1333306084-3776-1-git-send-email-emilgoode@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index f386dc49f988..7515cf0e1805 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -216,9 +216,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = 1; /* - * 0 is a valid user pointer (in the access_ok sense) on 32-bit and + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, 0 means "don't write anything" not "write it at + * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ ret = -EFAULT; @@ -247,7 +247,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, - 0); + NULL); break; } -- cgit v1.2.3 From 2ca052a3710fac208eee690faefdeb8bbd4586a1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 2 Apr 2012 16:15:33 -0700 Subject: x86: Use correct byte-sized register constraint in __xchg_op() x86-64 can access the low half of any register, but i386 can only do it with a subset of registers. 'r' causes compilation failures on i386, but 'q' expresses the constraint properly. Signed-off-by: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/4F7A3315.501@goop.org Reported-by: Leigh Scott Tested-by: Thomas Reitmayr Signed-off-by: H. 
Peter Anvin Cc: v3.3 --- arch/x86/include/asm/cmpxchg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index b3b733262909..bc18d0ed459a 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -43,7 +43,7 @@ extern void __add_wrong_size(void) switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ asm volatile (lock #op "b %b0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ + : "+q" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_W: \ -- cgit v1.2.3 From 8c91c5325e107ec17e40a59a47c6517387d64eb7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 6 Apr 2012 09:30:57 -0700 Subject: x86: Use correct byte-sized register constraint in __add() Similar to: 2ca052a x86: Use correct byte-sized register constraint in __xchg_op() ... the __add() macro also needs to use a "q" constraint in the byte-sized case, lest we try to generate an illegal register. Link: http://lkml.kernel.org/r/4F7A3315.501@goop.org Signed-off-by: H. 
Peter Anvin Cc: Jeremy Fitzhardinge Cc: Leigh Scott Cc: Thomas Reitmayr Cc: v3.3 --- arch/x86/include/asm/cmpxchg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index bc18d0ed459a..99480e55973d 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -173,7 +173,7 @@ extern void __add_wrong_size(void) switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ asm volatile (lock "addb %b1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ + : "+m" (*(ptr)) : "qi" (inc) \ : "memory", "cc"); \ break; \ case __X86_CASE_W: \ -- cgit v1.2.3 From f68e556e23d1a4176b563bcb25d8baf2c5313f91 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 6 Apr 2012 13:54:56 -0700 Subject: Make the "word-at-a-time" helper functions more commonly usable I have a new optimized x86 "strncpy_from_user()" that will use these same helper functions for all the same reasons the name lookup code uses them. This is preparation for that. This moves them into an architecture-specific header file. It's architecture-specific for two reasons: - some of the functions are likely to want architecture-specific implementations. Even if the current code happens to be "generic" in the sense that it should work on any little-endian machine, it's likely that the "multiply by a big constant and shift" implementation is less than optimal for an architecture that has a guaranteed fast bit count instruction, for example. - I expect that if architectures like sparc want to start playing around with this, we'll need to abstract out a few more details (in particular the actual unaligned accesses). So we're likely to have more architecture-specific stuff if non-x86 architectures start using this. 
(and if it turns out that non-x86 architectures don't start using this, then having it in an architecture-specific header is still the right thing to do, of course) Signed-off-by: Linus Torvalds --- arch/x86/include/asm/word-at-a-time.h | 46 +++++++++++++++++++++++++++++++++++ fs/namei.c | 35 +++----------------------- 2 files changed, 49 insertions(+), 32 deletions(-) create mode 100644 arch/x86/include/asm/word-at-a-time.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h new file mode 100644 index 000000000000..6fe6767b7124 --- /dev/null +++ b/arch/x86/include/asm/word-at-a-time.h @@ -0,0 +1,46 @@ +#ifndef _ASM_WORD_AT_A_TIME_H +#define _ASM_WORD_AT_A_TIME_H + +/* + * This is largely generic for little-endian machines, but the + * optimal byte mask counting is probably going to be something + * that is architecture-specific. If you have a reliably fast + * bit count instruction, that might be better than the multiply + * and shift, for example. + */ + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. 
+ */ +static inline long count_masked_bytes(unsigned long mask) +{ + return mask*0x0001020304050608ul >> 56; +} + +#else /* 32-bit case */ + +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; +} + +#endif + +#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) + +/* Return the high bit set in the first byte that is a zero */ +static inline unsigned long has_zero(unsigned long a) +{ + return ((a - REPEAT_BYTE(0x01)) & ~a) & REPEAT_BYTE(0x80); +} + +#endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/fs/namei.c b/fs/namei.c index 1898198abc3d..0062dd17eb55 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1407,18 +1407,9 @@ static inline int can_lookup(struct inode *inode) */ #ifdef CONFIG_DCACHE_WORD_ACCESS -#ifdef CONFIG_64BIT +#include <asm/word-at-a-time.h> -/* - * Jan Achrenius on G+: microoptimized version of - * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" - * that works for the bytemasks without having to - * mask them first. 
- */ -static inline long count_masked_bytes(unsigned long mask) -{ - return mask*0x0001020304050608ul >> 56; -} +#ifdef CONFIG_64BIT static inline unsigned int fold_hash(unsigned long hash) { @@ -1428,15 +1419,6 @@ static inline unsigned int fold_hash(unsigned long hash) #else /* 32-bit case */ -/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ -static inline long count_masked_bytes(long mask) -{ - /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ - long a = (0x0ff0001+mask) >> 23; - /* Fix the 1 for 00 case */ - return a & mask; -} - #define fold_hash(x) (x) #endif @@ -1464,17 +1446,6 @@ done: } EXPORT_SYMBOL(full_name_hash); -#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) -#define ONEBYTES REPEAT_BYTE(0x01) -#define SLASHBYTES REPEAT_BYTE('/') -#define HIGHBITS REPEAT_BYTE(0x80) - -/* Return the high bit set in the first byte that is a zero */ -static inline unsigned long has_zero(unsigned long a) -{ - return ((a - ONEBYTES) & ~a) & HIGHBITS; -} - /* * Calculate the length and hash of the path component, and * return the length of the component; @@ -1490,7 +1461,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) len += sizeof(unsigned long); a = *(unsigned long *)(name+len); /* Do we have any NUL or '/' bytes in this word? */ - mask = has_zero(a) | has_zero(a ^ SLASHBYTES); + mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); } while (!mask); /* The mask *below* the first high bit set */ -- cgit v1.2.3 From 3cb42092ff02edec34bf936b7400b1f1efc8ca43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Apr 2012 13:59:00 -0400 Subject: um: fix linker script generation while we can't just use -U$(SUBARCH), we still need to kill idiotic define (implicit -Di386=1), both for SUBARCH=i386 and SUBARCH=x86/CONFIG_64BIT=n builds. 
Signed-off-by: Al Viro --- arch/um/kernel/Makefile | 7 ++++--- arch/x86/Makefile.um | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 492bc4c1b62b..65a1c3d690ea 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -3,9 +3,10 @@ # Licensed under the GPL # -CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ - -DELF_ARCH=$(LDS_ELF_ARCH) \ - -DELF_FORMAT=$(LDS_ELF_FORMAT) +CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ + -DELF_ARCH=$(LDS_ELF_ARCH) \ + -DELF_FORMAT=$(LDS_ELF_FORMAT) \ + $(LDS_EXTRA) extra-y := vmlinux.lds clean-files := diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 4be406abeefd..36b62bc52638 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -14,6 +14,9 @@ LINK-y += $(call cc-option,-m32) export LDFLAGS +LDS_EXTRA := -Ui386 +export LDS_EXTRA + # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. include $(srctree)/arch/x86/Makefile_32.cpu -- cgit v1.2.3 From a3a85a763c399c0bf483a30d82d2d613e6f94cd3 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 29 Mar 2012 18:47:46 +0200 Subject: um: Disintegrate asm/system.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Richard Weinberger Reported-by: Toralf Förster CC: dhowells@redhat.com --- arch/um/drivers/mconsole_kern.c | 1 + arch/um/include/asm/Kbuild | 2 +- arch/x86/um/asm/barrier.h | 75 ++++++++++++++++++++++ arch/x86/um/asm/switch_to.h | 7 +++ arch/x86/um/asm/system.h | 135 ---------------------------------------- 5 files changed, 84 insertions(+), 136 deletions(-) create mode 100644 arch/x86/um/asm/barrier.h create mode 100644 arch/x86/um/asm/switch_to.h delete mode 100644 arch/x86/um/asm/system.h (limited to 'arch/x86') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index e672bd6d43e3..43b39d61b538 100644 --- 
a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "init.h" #include "irq_kern.h" diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 8419f5cf2ac7..bb5d6e68829b 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,3 +1,3 @@ generic-y += bug.h cputime.h device.h emergency-restart.h futex.h hardirq.h generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h -generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h +generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h new file mode 100644 index 000000000000..7d01b8c56c00 --- /dev/null +++ b/arch/x86/um/asm/barrier.h @@ -0,0 +1,75 @@ +#ifndef _ASM_UM_BARRIER_H_ +#define _ASM_UM_BARRIER_H_ + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. 
+ */ +#ifdef CONFIG_X86_32 + +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) + +#else /* CONFIG_X86_32 */ + +#define mb() asm volatile("mfence" : : : "memory") +#define rmb() asm volatile("lfence" : : : "memory") +#define wmb() asm volatile("sfence" : : : "memory") + +#endif /* CONFIG_X86_32 */ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP + +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +#define smp_rmb() rmb() +#else /* CONFIG_X86_PPRO_FENCE */ +#define smp_rmb() barrier() +#endif /* CONFIG_X86_PPRO_FENCE */ + +#ifdef CONFIG_X86_OOSTORE +#define smp_wmb() wmb() +#else /* CONFIG_X86_OOSTORE */ +#define smp_wmb() barrier() +#endif /* CONFIG_X86_OOSTORE */ + +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) + +#else /* CONFIG_SMP */ + +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) + +#endif /* CONFIG_SMP */ + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) 
+ */ +static inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif diff --git a/arch/x86/um/asm/switch_to.h b/arch/x86/um/asm/switch_to.h new file mode 100644 index 000000000000..cf97d20da61f --- /dev/null +++ b/arch/x86/um/asm/switch_to.h @@ -0,0 +1,7 @@ +#ifndef _ASM_UM_SWITCH_TO_H_ +#define _ASM_UM_SWITCH_TO_H_ + +extern void *_switch_to(void *prev, void *next, void *last); +#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) + +#endif diff --git a/arch/x86/um/asm/system.h b/arch/x86/um/asm/system.h deleted file mode 100644 index a459fd9b7598..000000000000 --- a/arch/x86/um/asm/system.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef _ASM_X86_SYSTEM_H_ -#define _ASM_X86_SYSTEM_H_ - -#include -#include -#include -#include -#include - -#include -#include - -/* entries in ARCH_DLINFO: */ -#ifdef CONFIG_IA32_EMULATION -# define AT_VECTOR_SIZE_ARCH 2 -#else -# define AT_VECTOR_SIZE_ARCH 1 -#endif - -extern unsigned long arch_align_stack(unsigned long sp); - -void default_idle(void); - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - */ -#ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) -#else -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") -#endif - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. 
All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. 
- **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() -#else -# define smp_rmb() barrier() -#endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static inline void rdtsc_barrier(void) -{ - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); -} - -extern void *_switch_to(void *prev, void *next, void *last); -#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) - -#endif -- cgit v1.2.3 From 76b278edd99fb55525fcf2706095e388bd3d122c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 29 Mar 2012 19:10:42 +0200 Subject: um: Use asm-generic/switch_to.h Signed-off-by: Richard Weinberger --- arch/um/include/asm/Kbuild | 1 + arch/um/kernel/process.c | 6 +----- arch/x86/um/asm/switch_to.h | 7 ------- 3 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 arch/x86/um/asm/switch_to.h (limited to 'arch/x86') diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index bb5d6e68829b..fff24352255d 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,3 +1,4 @@ generic-y += bug.h cputime.h device.h emergency-restart.h futex.h hardirq.h generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h 
topology.h xor.h generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h +generic-y += switch_to.h diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index f386d04a84a5..2b73dedb44ca 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -88,11 +88,8 @@ static inline void set_current(struct task_struct *task) extern void arch_switch_to(struct task_struct *to); -void *_switch_to(void *prev, void *next, void *last) +void *__switch_to(struct task_struct *from, struct task_struct *to) { - struct task_struct *from = prev; - struct task_struct *to = next; - to->thread.prev_sched = from; set_current(to); @@ -111,7 +108,6 @@ void *_switch_to(void *prev, void *next, void *last) } while (current->thread.saved_task); return current->thread.prev_sched; - } void interrupt_end(void) diff --git a/arch/x86/um/asm/switch_to.h b/arch/x86/um/asm/switch_to.h deleted file mode 100644 index cf97d20da61f..000000000000 --- a/arch/x86/um/asm/switch_to.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_UM_SWITCH_TO_H_ -#define _ASM_UM_SWITCH_TO_H_ - -extern void *_switch_to(void *prev, void *next, void *last); -#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) - -#endif -- cgit v1.2.3 From 92ae03f2ef99fbc23bfa9080d6b58f25227bd7ef Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 6 Apr 2012 14:32:32 -0700 Subject: x86: merge 32/64-bit versions of 'strncpy_from_user()' and speed it up This merges the 32- and 64-bit versions of the x86 strncpy_from_user() by just rewriting it in C rather than the ancient inline asm versions that used lodsb/stosb and had been duplicated for (trivial) differences between the 32-bit and 64-bit versions. While doing that, it also speeds them up by doing the accesses a word at a time. Finally, the new routines also properly handle the case of hitting the end of the address space, which we have never done correctly before (fs/namei.c has a hack around it for that reason). 
Despite all these improvements, it actually removes more lines than it adds, due to the de-duplication. Also, we no longer export (or define) the legacy __strncpy_from_user() function (that was defined to not do the user permission checks), since it's not actually used anywhere, and the user address space checks are built in to the new code. Other architecture maintainers have been notified that the old hack in fs/namei.c will be going away in the 3.5 merge window, in case they copied the x86 approach of being a bit cavalier about the end of the address space. Cc: linux-arch@vger.kernel.org Cc: Ingo Molnar Cc: Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 2 + arch/x86/include/asm/uaccess_32.h | 5 -- arch/x86/include/asm/uaccess_64.h | 4 -- arch/x86/lib/usercopy.c | 103 ++++++++++++++++++++++++++++++++++++++ arch/x86/lib/usercopy_32.c | 87 -------------------------------- arch/x86/lib/usercopy_64.c | 49 ------------------ 6 files changed, 105 insertions(+), 145 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8be5f54d9360..e0544597cfe7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -557,6 +557,8 @@ struct __large_struct { unsigned long buf[100]; }; extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); +extern __must_check long +strncpy_from_user(char *dst, const char __user *src, long count); /* * movsl can be slow when source and dest are not both 8-byte aligned diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 566e803cc602..8084bc73b18c 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -213,11 +213,6 @@ static inline unsigned long __must_check copy_from_user(void *to, return n; } -long __must_check strncpy_from_user(char *dst, const char __user *src, - long count); -long __must_check __strncpy_from_user(char
*dst, - const char __user *src, long count); - /** * strlen_user: - Get the size of a string in user space. * @str: The string to measure. diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 1c66d30971ad..fcd4b6f3ef02 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -208,10 +208,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) } } -__must_check long -strncpy_from_user(char *dst, const char __user *src, long count); -__must_check long -__strncpy_from_user(char *dst, const char __user *src, long count); __must_check long strnlen_user(const char __user *str, long n); __must_check long __strnlen_user(const char __user *str, long n); __must_check long strlen_user(const char __user *str); diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 97be9cb54483..57252c928f56 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -7,6 +7,8 @@ #include #include +#include + /* * best effort, GUP based copy_from_user() that is NMI-safe */ @@ -41,3 +43,104 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); + +static inline unsigned long count_bytes(unsigned long mask) +{ + mask = (mask - 1) & ~mask; + mask >>= 7; + return count_masked_bytes(mask); +} + +/* + * Do a strncpy, return length of string without final '\0'. + * 'count' is the user-supplied count (return 'count' if we + * hit it), 'max' is the address space maximum (and we return + * -EFAULT if we hit it). 
+ */ +static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, long max) +{ + long res = 0; + + /* + * Truncate 'max' to the user-specified limit, so that + * we only have one limit we need to check in the loop + */ + if (max > count) + max = count; + + while (max >= sizeof(unsigned long)) { + unsigned long c; + + /* Fall back to byte-at-a-time if we get a page fault */ + if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) + break; + /* This can write a few bytes past the NUL character, but that's ok */ + *(unsigned long *)(dst+res) = c; + c = has_zero(c); + if (c) + return res + count_bytes(c); + res += sizeof(unsigned long); + max -= sizeof(unsigned long); + } + + while (max) { + char c; + + if (unlikely(__get_user(c,src+res))) + return -EFAULT; + dst[res] = c; + if (!c) + return res; + res++; + max--; + } + + /* + * Uhhuh. We hit 'max'. But was that the user-specified maximum + * too? If so, that's ok - we got as much as the user asked for. + */ + if (res >= count) + return count; + + /* + * Nope: we hit the address space limit, and we still had more + * characters the caller would have wanted. That's an EFAULT. + */ + return -EFAULT; +} + +/** + * strncpy_from_user: - Copy a NUL terminated string from userspace. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @src: Source address, in user space. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from userspace to kernel space. + * + * On success, returns the length of the string (not including the trailing + * NUL). + * + * If access to userspace fails, returns -EFAULT (some data may have been + * copied). + * + * If @count is smaller than the length of the string, copies @count bytes + * and returns @count. 
+ */ +long +strncpy_from_user(char *dst, const char __user *src, long count) +{ + unsigned long max_addr, src_addr; + + if (unlikely(count <= 0)) + return 0; + + max_addr = current_thread_info()->addr_limit.seg; + src_addr = (unsigned long)src; + if (likely(src_addr < max_addr)) { + unsigned long max = max_addr - src_addr; + return do_strncpy_from_user(dst, src, count, max); + } + return -EFAULT; +} +EXPORT_SYMBOL(strncpy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index d9b094ca7aaa..ef2a6a5d78e3 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -32,93 +32,6 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon #define movsl_is_ok(a1, a2, n) \ __movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n)) -/* - * Copy a null terminated string from userspace. - */ - -#define __do_strncpy_from_user(dst, src, count, res) \ -do { \ - int __d0, __d1, __d2; \ - might_fault(); \ - __asm__ __volatile__( \ - " testl %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decl %1\n" \ - " jnz 0b\n" \ - "1: subl %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - : "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -/** - * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * Caller must check the specified block with access_ok() before calling - * this function. 
- * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -/** - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - /* * Zero Userspace */ diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index b7c2849ffb66..0d0326f388c0 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -8,55 +8,6 @@ #include #include -/* - * Copy a null terminated string from userspace. 
- */ - -#define __do_strncpy_from_user(dst,src,count,res) \ -do { \ - long __d0, __d1, __d2; \ - might_fault(); \ - __asm__ __volatile__( \ - " testq %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decq %1\n" \ - " jnz 0b\n" \ - "1: subq %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movq %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(0b,3b) \ - : "=&r"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - return __strncpy_from_user(dst, src, count); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - /* * Zero Userspace */ -- cgit v1.2.3