From 0f60a8efe4005ab5e65ce000724b04d4ca04a199 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jul 2016 16:19:48 -0700 Subject: mm: Implement stack frame object validation This creates per-architecture function arch_within_stack_frames() that should validate if a given object is contained by a kernel stack frame. Initial implementation is on x86. This is based on code from PaX. Signed-off-by: Kees Cook --- arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 44 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d9a94da0c29f..a2865ddfc1ff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -91,6 +91,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 30c133ac05cd..ab386f1336f2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -180,6 +180,50 @@ static inline unsigned long current_stack_pointer(void) return sp; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 5b710f34e194c6b7710f69fdb5d798fdf35b98c1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jun 2016 15:04:01 -0700 Subject: x86/uaccess: Enable hardened usercopy Enables CONFIG_HARDENED_USERCOPY checks on x86. This is done both in copy_*_user() and __copy_*_user() because copy_*_user() actually calls down to _copy_*_user() and not __copy_*_user(). Based on code from PaX and grsecurity. Signed-off-by: Kees Cook Tested-by: Valdis Kletnieks --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 10 ++++++---- arch/x86/include/asm/uaccess_32.h | 2 ++ arch/x86/include/asm/uaccess_64.h | 2 ++ 4 files changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a2865ddfc1ff..9640942b68b9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -80,6 +80,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2982387ba817..d3312f0fcdfc 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -742,9 +742,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) * case, and do only runtime checking for non-constant sizes. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_from_user_overflow(); else __copy_from_user_overflow(sz, n); @@ -762,9 +763,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); /* See the comment in copy_from_user() above. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) + } else if (__builtin_constant_p(n)) copy_to_user_overflow(); else __copy_to_user_overflow(sz, n); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 4b32da24faaf..7d3bdd1ed697 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -37,6 +37,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); return __copy_to_user_ll(to, from, n); } @@ -95,6 +96,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 2eac2aa3e37f..673059a109fe 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -54,6 +54,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -119,6 +120,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { -- cgit v1.2.3 From 65ea11ec6a82b1d44aba62b59e9eb20247e57c6e Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 8 Aug 2016 20:35:29 +0300 Subject: x86/hweight: Don't clobber %rdi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The caller expects %rdi to remain intact, push+pop it make that happen. Fixes the following kind of explosions on my core2duo machine when trying to reboot or shut down: general protection fault: 0000 [#1] PREEMPT SMP Modules linked in: i915 i2c_algo_bit drm_kms_helper cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea drm netconsole configfs binfmt_misc iTCO_wdt psmouse pcspkr snd_hda_codec_idt e100 coretemp hwmon snd_hda_codec_generic i2c_i801 mii i2c_smbus lpc_ich mfd_core snd_hda_intel uhci_hcd snd_hda_codec snd_hwdep snd_hda_core ehci_pci 8250 ehci_hcd snd_pcm 8250_base usbcore evdev serial_core usb_common parport_pc parport snd_timer snd soundcore CPU: 0 PID: 3070 Comm: reboot Not tainted 4.8.0-rc1-perf-dirty #69 Hardware name: /D946GZIS, BIOS TS94610J.86A.0087.2007.1107.1049 11/07/2007 task: ffff88012a0b4080 task.stack: ffff880123850000 RIP: 0010:[] [] x86_perf_event_update+0x52/0xc0 RSP: 0018:ffff880123853b60 EFLAGS: 00010087 RAX: 0000000000000001 RBX: ffff88012fc0a3c0 RCX: 000000000000001e RDX: 0000000000000000 RSI: 0000000040000000 RDI: ffff88012b014800 RBP: ffff880123853b88 R08: ffffffffffffffff R09: 0000000000000000 R10: ffffea0004a012c0 R11: ffffea0004acedc0 R12: ffffffff80000001 R13: ffff88012b0149c0 R14: ffff88012b014800 R15: 0000000000000018 FS: 00007f8b155cd700(0000) GS:ffff88012fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8b155f5000 CR3: 000000012a2d7000 CR4: 00000000000006f0 Stack: ffff88012fc0a3c0 ffff88012b014800 0000000000000004 0000000000000001 ffff88012fc1b750 ffff880123853bb0 ffffffff81003d59 ffff88012b014800 ffff88012fc0a3c0 ffff88012b014800 ffff880123853bd8 ffffffff81003e13 Call Trace: [] x86_pmu_stop+0x59/0xd0 [] x86_pmu_del+0x43/0x140 [] event_sched_out.isra.105+0xbd/0x260 [] __perf_remove_from_context+0x2d/0xb0 [] __perf_event_exit_context+0x4d/0x70 [] generic_exec_single+0xb6/0x140 [] ? __perf_remove_from_context+0xb0/0xb0 [] ? __perf_remove_from_context+0xb0/0xb0 [] smp_call_function_single+0xdf/0x140 [] perf_event_exit_cpu_context+0x87/0xc0 [] perf_reboot+0x13/0x40 [] notifier_call_chain+0x4a/0x70 [] __blocking_notifier_call_chain+0x47/0x60 [] blocking_notifier_call_chain+0x16/0x20 [] kernel_restart_prepare+0x1d/0x40 [] kernel_restart+0x12/0x60 [] SYSC_reboot+0xf6/0x1b0 [] ? mntput_no_expire+0x2c/0x1b0 [] ? mntput+0x24/0x40 [] ? __fput+0x16c/0x1e0 [] ? ____fput+0xe/0x10 [] ? task_work_run+0x83/0xa0 [] ? exit_to_usermode_loop+0x53/0xc0 [] ? trace_hardirqs_on_thunk+0x1a/0x1c [] SyS_reboot+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x18/0xa3 Code: 7c 4c 8d af c0 01 00 00 49 89 fe eb 10 48 09 c2 4c 89 e0 49 0f b1 55 00 4c 39 e0 74 35 4d 8b a6 c0 01 00 00 41 8b 8e 60 01 00 00 <0f> 33 8b 35 6e 02 8c 00 48 c1 e2 20 85 f6 7e d2 48 89 d3 89 cf RIP [] x86_perf_event_update+0x52/0xc0 RSP ---[ end trace 7ec95181faf211be ]--- note: reboot[3070] exited with preempt_count 2 Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Fixes: f5967101e9de ("x86/hweight: Get rid of the special calling convention") Signed-off-by: Ville Syrjälä Signed-off-by: Linus Torvalds --- arch/x86/lib/hweight.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 02de3d74d2c5..8a602a1e404a 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32) ENTRY(__sw_hweight64) #ifdef CONFIG_X86_64 + pushq %rdi pushq %rdx movq %rdi, %rdx # w -> t @@ -60,6 +61,7 @@ ENTRY(__sw_hweight64) shrq $56, %rax # w = w_tmp >> 56 popq %rdx + popq %rdi ret #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ -- cgit v1.2.3 From 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Aug 2016 13:02:01 -0700 Subject: unsafe_[get|put]_user: change interface to use a error target label When I initially added the unsafe_[get|put]_user() helpers in commit 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses"), I made the mistake of modeling the interface on our traditional __[get|put]_user() functions, which return zero on success, or -EFAULT on failure. That interface is fairly easy to use, but it's actually fairly nasty for good code generation, since it essentially forces the caller to check the error value for each access. In particular, since the error handling is already internally implemented with an exception handler, and we already use "asm goto" for various other things, we could fairly easily make the error cases just jump directly to an error label instead, and avoid the need for explicit checking after each operation. So switch the interface to pass in an error label, rather than checking the error value in the caller. Best do it now before we start growing more users (the signal handling code in particular would be a good place to use the new interface). So rather than if (unsafe_get_user(x, ptr)) ... handle error .. the interface is now unsafe_get_user(x, ptr, label); where an error during the user mode fetch will now just cause a jump to 'label' in the caller. Right now the actual _implementation_ of this all still ends up being a "if (err) goto label", and does not take advantage of any exception label tricks, but for "unsafe_put_user()" in particular it should be fairly straightforward to convert to using the exception table model. Note that "unsafe_get_user()" is much harder to convert to a clever exception table model, because current versions of gcc do not allow the use of "asm goto" (for the exception) with output values (for the actual value to be fetched). But that is hopefully not a limitation in the long term. [ Also note that it might be a good idea to switch unsafe_get_user() to actually _return_ the value it fetches from user space, but this commit only changes the error handling semantics ] Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 16 ++++++++-------- include/linux/uaccess.h | 4 ++-- lib/strncpy_from_user.c | 8 ++++---- lib/strnlen_user.c | 7 +++---- 4 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c03bfb68c503..52f230094c51 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -812,21 +812,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 349557825428..f30c187ed785 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr) __get_user(x, ptr) -#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) +#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33f655ef48cd..9c5fe8110413 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -40,8 +40,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - break; + unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); + *(unsigned long *)(dst+res) = c; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); @@ -56,8 +56,7 @@ byte_at_a_time: while (max) { char c; - if (unlikely(unsafe_get_user(c,src+res))) - return -EFAULT; + unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; @@ -76,6 +75,7 @@ byte_at_a_time: * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ +efault: return -EFAULT; } diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 2625943625d7..8e105ed4df12 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, src -= align; max += align; - if (unlikely(unsafe_get_user(c,(unsigned long __user *)src))) - return 0; + unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { @@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); - if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res)))) - return 0; + unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; @@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ +efault: return 0; } -- cgit v1.2.3 From e4630fdd47637168927905983205d7b7c5c08c09 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Aug 2016 15:31:31 +0200 Subject: x86/power/64: Always create temporary identity mapping correctly The low-level resume-from-hibernation code on x86-64 uses kernel_ident_mapping_init() to create the temoprary identity mapping, but that function assumes that the offset between kernel virtual addresses and physical addresses is aligned on the PGD level. However, with a randomized identity mapping base, it may be aligned on the PUD level and if that happens, the temporary identity mapping created by set_up_temporary_mappings() will not reflect the actual kernel identity mapping and the image restoration will fail as a result (leading to a kernel panic most of the time). To fix this problem, rework kernel_ident_mapping_init() to support unaligned offsets between KVA and PA up to the PMD level and make set_up_temporary_mappings() use it as approprtiate. Reported-and-tested-by: Thomas Garnier Reported-by: Borislav Petkov Suggested-by: Yinghai Lu Signed-off-by: Rafael J. Wysocki Acked-by: Yinghai Lu --- arch/x86/include/asm/init.h | 4 ++-- arch/x86/mm/ident_map.c | 19 +++++++++++-------- arch/x86/power/hibernate_64.c | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 223042086f4e..737da62bfeb0 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,10 +5,10 @@ struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ unsigned long pmd_flag; /* page flag for PMD entry */ - bool kernel_mapping; /* kernel mapping or ident mapping */ + unsigned long offset; /* ident mapping offset */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end); + unsigned long pstart, unsigned long pend); #endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index ec21796ac5fd..4473cb4f8b90 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -3,15 +3,17 @@ * included by both the compressed kernel and the regular kernel. */ -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, +static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, unsigned long addr, unsigned long end) { addr &= PMD_MASK; for (; addr < end; addr += PMD_SIZE) { pmd_t *pmd = pmd_page + pmd_index(addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); + if (pmd_present(*pmd)) + continue; + + set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag)); } } @@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (pud_present(*pud)) { pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); continue; } pmd = (pmd_t *)info->alloc_pgt_page(info->context); if (!pmd) return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } @@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, } int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) + unsigned long pstart, unsigned long pend) { + unsigned long addr = pstart + info->offset; + unsigned long end = pend + info->offset; unsigned long next; int result; - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pgd_t *pgd = pgd_page + pgd_index(addr); pud_t *pud; next = (addr & PGDIR_MASK) + PGDIR_SIZE; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f0b5f2d402af..a3e3ccc87138 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -87,7 +87,7 @@ static int set_up_temporary_mappings(void) struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, - .kernel_mapping = true, + .offset = __PAGE_OFFSET, }; unsigned long mstart, mend; pgd_t *pgd; -- cgit v1.2.3 From 1a9e4c564ab174e53ed86def922804a5ddc63e7d Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 14 Jul 2016 17:22:54 +0200 Subject: x86/timers/apic: Fix imprecise timer interrupts by eliminating TSC clockevents frequency roundoff error I noticed the following bug/misbehavior on certain Intel systems: with a single task running on a NOHZ CPU on an Intel Haswell, I recognized that I did not only get the one expected local_timer APIC interrupt, but two per second at minimum. (!) Further tracing showed that the first one precedes the programmed deadline by up to ~50us and hence, it did nothing except for reprogramming the TSC deadline clockevent device to trigger shortly thereafter again. The reason for this is imprecise calibration, the timeout we program into the APIC results in 'too short' timer interrupts. The core (hr)timer code notices this (because it has a precise ktime source and sees the short interrupt) and fixes it up by programming an additional very short interrupt period. This is obviously suboptimal. The reason for the imprecise calibration is twofold, and this patch fixes the first reason: In setup_APIC_timer(), the registered clockevent device's frequency is calculated by first dividing tsc_khz by TSC_DIVISOR and multiplying it with 1000 afterwards: (tsc_khz / TSC_DIVISOR) * 1000 The multiplication with 1000 is done for converting from kHz to Hz and the division by TSC_DIVISOR is carried out in order to make sure that the final result fits into an u32. However, with the order given in this calculation, the roundoff error introduced by the division gets magnified by a factor of 1000 by the following multiplication. To fix it, reversing the order of the division and the multiplication a la: (tsc_khz * 1000) / TSC_DIVISOR ... reduces the roundoff error already. Furthermore, if TSC_DIVISOR divides 1000, associativity holds: (tsc_khz * 1000) / TSC_DIVISOR = tsc_khz * (1000 / TSC_DIVISOR) and thus, the roundoff error even vanishes and the whole operation can be carried out within 32 bits. The powers of two that divide 1000 are 2, 4 and 8. A value of 8 for TSC_DIVISOR still allows for TSC frequencies up to 2^32 / 10^9ns * 8 = 34.4GHz which is way larger than anything to expect in the next years. Thus we also replace the current TSC_DIVISOR value of 32 by 8. Reverse the order of the divison and the multiplication in the calculation of the registered clockevent device's frequency. Signed-off-by: Nicolai Stange Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Acked-by: Thomas Gleixner Cc: Adrian Hunter Cc: Borislav Petkov Cc: Christopher S. Hall Cc: H. Peter Anvin Cc: Hidehiro Kawai Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20160714152255.18295-2-nicstange@gmail.com [ Improved changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ac8d8ad8b009..a315dc404756 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -313,7 +313,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 -#define TSC_DIVISOR 32 +#define TSC_DIVISOR 8 /* * This function sets up the local APIC timer, with a timeout of @@ -565,7 +565,7 @@ static void setup_APIC_timer(void) CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; clockevents_config_and_register(levt, - (tsc_khz / TSC_DIVISOR) * 1000, + tsc_khz * (1000 / TSC_DIVISOR), 0xF, ~0UL); } else clockevents_register_device(levt); -- cgit v1.2.3 From 6731b0d611a1274f9e785fa0189ac2aeeabd0591 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 14 Jul 2016 17:22:55 +0200 Subject: x86/timers/apic: Inform TSC deadline clockevent device about recalibration This patch eliminates a source of imprecise APIC timer interrupts, which imprecision may result in double interrupts or even late interrupts. The TSC deadline clockevent devices' configuration and registration happens before the TSC frequency calibration is refined in tsc_refine_calibration_work(). This results in the TSC clocksource and the TSC deadline clockevent devices being configured with slightly different frequencies: the former gets the refined one and the latter are configured with the inaccurate frequency detected earlier by means of the "Fast TSC calibration using PIT". Within the APIC code, introduce the notifier function lapic_update_tsc_freq() which reconfigures all per-CPU TSC deadline clockevent devices with the current tsc_khz. Call it from the TSC code after TSC calibration refinement has happened. Signed-off-by: Nicolai Stange Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Adrian Hunter Cc: Borislav Petkov Cc: Christopher S. Hall Cc: H. Peter Anvin Cc: Hidehiro Kawai Cc: Len Brown Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Viresh Kumar Link: http://lkml.kernel.org/r/20160714152255.18295-3-nicstange@gmail.com [ Pushed #ifdef CONFIG_X86_LOCAL_APIC into header, improved changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/kernel/apic/apic.c | 24 ++++++++++++++++++++++++ arch/x86/kernel/tsc.c | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f5befd4945f2..124357773ffa 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -135,6 +135,7 @@ extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); +extern void lapic_update_tsc_freq(void); extern int APIC_init_uniprocessor(void); #ifdef CONFIG_X86_64 @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop +static inline void lapic_update_tsc_freq(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a315dc404756..0fd3d659f13c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -571,6 +571,30 @@ static void setup_APIC_timer(void) clockevents_register_device(levt); } +/* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + /* * In this functions we calibrate APIC bus clocks to the external timer. * diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a804b5ab32d0..8fb4b6abac0e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -22,6 +22,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -1249,6 +1250,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); + /* Inform the TSC deadline clockevent devices about the recalibration */ + lapic_update_tsc_freq(); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; -- cgit v1.2.3 From 469f00231278da68062a809306df0bac95a27507 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 15 Jul 2016 11:42:43 +0200 Subject: x86, kasan, ftrace: Put APIC interrupt handlers into .irqentry.text Dmitry Vyukov has reported unexpected KASAN stackdepot growth: https://github.com/google/kasan/issues/36 ... which is caused by the APIC handlers not being present in .irqentry.text: When building with CONFIG_FUNCTION_GRAPH_TRACER=y or CONFIG_KASAN=y, put the APIC interrupt handlers into the .irqentry.text section. This is needed because both KASAN and function graph tracer use __irqentry_text_start and __irqentry_text_end to determine whether a function is an IRQ entry point. Reported-by: Dmitry Vyukov Signed-off-by: Alexander Potapenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: aryabinin@virtuozzo.com Cc: kasan-dev@googlegroups.com Cc: kcc@google.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1468575763-144889-1-git-send-email-glider@google.com [ Minor edits. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b846875aeea6..9f85827db24e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -601,9 +601,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) .endm #endif +/* Make sure APIC interrupt handlers end up in the irqentry section: */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +# define POP_SECTION_IRQENTRY .popsection +#else +# define PUSH_SECTION_IRQENTRY +# define POP_SECTION_IRQENTRY +#endif + .macro apicinterrupt num sym do_sym +PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym trace_apicinterrupt \num \sym +POP_SECTION_IRQENTRY .endm #ifdef CONFIG_SMP -- cgit v1.2.3 From 22cc1ca3c5469cf17e149be232817b9223afa5e4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Aug 2016 21:54:53 +0200 Subject: x86/hpet: Fix /dev/rtc breakage caused by RTC cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ville Syrjälä reports "The first time I run hwclock after rebooting I get this: open("/dev/rtc", O_RDONLY) = 3 ioctl(3, PHN_SET_REGS or RTC_UIE_ON, 0) = 0 select(4, [3], NULL, NULL, {10, 0}) = 0 (Timeout) ioctl(3, PHN_NOT_OH or RTC_UIE_OFF, 0) = 0 close(3) = 0 On all subsequent runs I get this: open("/dev/rtc", O_RDONLY) = 3 ioctl(3, PHN_SET_REGS or RTC_UIE_ON, 0) = -1 EINVAL (Invalid argument) ioctl(3, RTC_RD_TIME, 0x7ffd76b3ae70) = -1 EINVAL (Invalid argument) close(3) = 0" This was caused by a stupid typo in a patch that should have been a simple rename to move around contents of a header file, but accidentally wrote zeroes into the rtc rather than reading from it: 463a86304cae ("char/genrtc: x86: remove remnants of asm/rtc.h") Reported-by: Ville Syrjälä Tested-by: Jarkko Nikula Tested-by: Ville Syrjälä Signed-off-by: Arnd Bergmann Cc: Alessandro Zummo Cc: Alexandre Belloni Cc: Borislav Petkov Cc: Geert Uytterhoeven Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: rtc-linux@googlegroups.com Fixes: 463a86304cae ("char/genrtc: x86: remove remnants of asm/rtc.h") Link: http://lkml.kernel.org/r/20160809195528.1604312-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ed16e58658a4..c6dfd801df97 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1242,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - mc146818_set_time(&curr_time); + mc146818_get_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { -- cgit v1.2.3 From c7d2361f7524f365c1ae42f47880e3fa9efb2c2a Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 9 Aug 2016 10:11:04 -0700 Subject: x86/mm/KASLR: Fix physical memory calculation on KASLR memory randomization Initialize KASLR memory randomization after max_pfn is initialized. Also ensure the size is rounded up. It could create problems on machines with more than 1Tb of memory on certain random addresses. Signed-off-by: Thomas Garnier Cc: Aleksey Makarov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Fabian Frederick Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions") Link: http://lkml.kernel.org/r/1470762665-88032-1-git-send-email-thgarnie@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++++-- arch/x86/mm/kaslr.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 991b77986d57..95cf31c9f4ec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); - kernel_randomize_memory(); - iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); @@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 26dccd6c0df1..ec8654f117d8 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -97,7 +97,7 @@ void __init kernel_randomize_memory(void) * add padding if needed (especially for memory hotplug support). */ BUG_ON(kaslr_regions[0].base != &page_offset_base); - memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + + memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* Adapt phyiscal memory region size based on available memory */ -- cgit v1.2.3 From fb754f958f8e46202c1efd7f66d5b3db1208117d Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 9 Aug 2016 10:11:05 -0700 Subject: x86/mm/KASLR: Increase BRK pages for KASLR memory randomization Default implementation expects 6 pages maximum are needed for low page allocations. If KASLR memory randomization is enabled, the worse case of e820 layout would require 12 pages (no large pages). It is due to the PUD level randomization and the variable e820 memory layout. This bug was found while doing extensive testing of KASLR memory randomization on different type of hardware. Signed-off-by: Thomas Garnier Cc: Aleksey Makarov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Christian Borntraeger Cc: Dan Williams Cc: Dave Hansen Cc: Dave Young Cc: Denys Vlasenko Cc: Fabian Frederick Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Lv Zheng Cc: Mark Salter Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions") Link: http://lkml.kernel.org/r/1470762665-88032-2-git-send-email-thgarnie@google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 620928903be3..d28a2d741f9e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num) return __va(pfn << PAGE_SHIFT); } -/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ -#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) +/* + * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. + * With KASLR memory randomization, depending on the machine e820 memory + * and the PUD alignment. We may need twice more pages when KASLR memory + * randomization is enabled. + */ +#ifndef CONFIG_RANDOMIZE_MEMORY +#define INIT_PGD_PAGE_COUNT 6 +#else +#define INIT_PGD_PAGE_COUNT 12 +#endif +#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { -- cgit v1.2.3 From 5cf0791da5c162ebc14b01eb01631cfa7ed4fa6e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 5 Aug 2016 15:37:39 +0200 Subject: x86/mm: Disable preemption during CR3 read+write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a subtle preemption race on UP kernels: Usually current->mm (and therefore mm->pgd) stays the same during the lifetime of a task so it does not matter if a task gets preempted during the read and write of the CR3. But then, there is this scenario on x86-UP: TaskA is in do_exit() and exit_mm() sets current->mm = NULL followed by: -> mmput() -> exit_mmap() -> tlb_finish_mmu() -> tlb_flush_mmu() -> tlb_flush_mmu_tlbonly() -> tlb_flush() -> flush_tlb_mm_range() -> __flush_tlb_up() -> __flush_tlb() -> __native_flush_tlb() At this point current->mm is NULL but current->active_mm still points to the "old" mm. Let's preempt taskA _after_ native_read_cr3() by taskB. TaskB has its own mm so CR3 has changed. Now preempt back to taskA. TaskA has no ->mm set so it borrows taskB's mm and so CR3 remains unchanged. Once taskA gets active it continues where it was interrupted and that means it writes its old CR3 value back. Everything is fine because userland won't need its memory anymore. Now the fun part: Let's preempt taskA one more time and get back to taskB. This time switch_mm() won't do a thing because oldmm (->active_mm) is the same as mm (as per context_switch()). So we remain with a bad CR3 / PGD and return to userland. The next thing that happens is handle_mm_fault() with an address for the execution of its code in userland. handle_mm_fault() realizes that it has a PTE with proper rights so it returns doing nothing. But the CPU looks at the wrong PGD and insists that something is wrong and faults again. And again. And one more time… This pagefault circle continues until the scheduler gets tired of it and puts another task on the CPU. It gets little difficult if the task is a RT task with a high priority. The system will either freeze or it gets fixed by the software watchdog thread which usually runs at RT-max prio. But waiting for the watchdog will increase the latency of the RT task which is no good. Fix this by disabling preemption across the critical code section. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1470404259-26290-1-git-send-email-bigeasy@linutronix.de [ Prettified the changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4e5be94e079a..6fa85944af83 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) -- cgit v1.2.3 From 3e035305875cfa8a58c1ca573d0cfa6a7f201f27 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Aug 2016 19:14:29 +0200 Subject: x86/entry: Clarify the RF saving/restoring situation with SYSCALL/SYSRET Clarify why exactly RF cannot be restored properly by SYSRET to avoid confusion. No functionality change. Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160803171429.GA2590@nazgul.tnic Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9f85827db24e..d172c619c449 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -288,11 +288,15 @@ return_from_SYSCALL_64: jne opportunistic_sysret_failed /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot + * restore RF properly. If the slowpath sets it for whatever reason, we + * need to restore it correctly. + * + * SYSRET can restore TF, but unlike IRET, restoring TF results in a + * trap from userspace immediately after SYSRET. This would cause an + * infinite loop whenever #DB happens with register state that satisfies + * the opportunistic SYSRET conditions. For example, single-stepping + * this user code: * * movq $stuck_here, %rcx * pushfq -- cgit v1.2.3 From 054f621fd5b1c7245710f5d3935c94ce6ae4b3b7 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:50 -0500 Subject: x86/platform/UV: Fix problem with UV4 Socket IDs not being contiguous The UV4 Socket IDs are not guaranteed to equate to Node values which can cause the GAM (Global Addressable Memory) table lookups to fail. Fix this by using an independent index into the GAM table instead of the Socket ID to reference the base address. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.048755337@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 09b59adaea3f..d9187336aa2e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -325,7 +325,7 @@ static __init void build_uv_gr_table(void) struct uv_gam_range_entry *gre = uv_gre_table; struct uv_gam_range_s *grt; unsigned long last_limit = 0, ram_limit = 0; - int bytes, i, sid, lsid = -1; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; if (!gre) return; @@ -356,11 +356,12 @@ static __init void build_uv_gr_table(void) } sid = gre->sockid - _min_socket; if (lsid < sid) { /* new range */ - grt = &_gr_table[sid]; - grt->base = lsid; + grt = &_gr_table[indx]; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid = sid; + lindx = indx++; continue; } if (lsid == sid && !ram_limit) { /* update range */ @@ -371,7 +372,7 @@ static __init void build_uv_gr_table(void) } if (!ram_limit) { /* non-contiguous ram range */ grt++; - grt->base = sid - 1; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; -- cgit v1.2.3 From e363d24c2b997c421476c6aa00547edadf678efe Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:51 -0500 Subject: x86/platform/UV: Fix bug with iounmap() of the UV4 EFI System Table causing a crash Save the uv_systab::size field before doing the iounmap() of the struct pointer, to avoid a NULL dereference crash. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.250424783@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/bios_uv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 66b2166ea4a1..4e9fd1378aec 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -199,12 +199,14 @@ void uv_bios_init(void) return; } + /* Starting with UV4 the UV systab size is variable */ if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { + int size = uv_systab->size; + iounmap(uv_systab); - uv_systab = ioremap(efi.uv_systab, uv_systab->size); + uv_systab = ioremap(efi.uv_systab, size); if (!uv_systab) { - pr_err("UV: UVsystab: ioremap(%d) failed!\n", - uv_systab->size); + pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); return; } } -- cgit v1.2.3 From 22ac2bca92f2d92c6495248d65ff648182df428d Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:52 -0500 Subject: x86/platform/UV: Fix problem with UV4 BIOS providing incorrect PXM values There are some circumstances where the UV4 BIOS cannot provide the correct Proximity Node values to associate with specific Sockets and Physical Nodes. The decision was made to remove these values from BIOS and for the kernel to get these values from the standard ACPI tables. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.414210079@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/bios.h | 5 +++-- arch/x86/kernel/apic/x2apic_uv_x.c | 28 ++++++++++------------------ 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index c852590254d5..e652a7cc6186 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -79,7 +79,7 @@ struct uv_gam_range_entry { u16 nasid; /* HNasid */ u16 sockid; /* Socket ID, high bits of APIC ID */ u16 pnode; /* Index to MMR and GRU spaces */ - u32 pxm; /* ACPI proximity domain number */ + u32 unused2; u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ }; @@ -88,7 +88,8 @@ struct uv_gam_range_entry { #define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ #define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ #define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ -#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 +#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */ +#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3 #define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ #define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d9187336aa2e..6aa0545879bb 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -1156,19 +1156,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", - "SID", "PN", "PXM"); + "SID", "PN"); } pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, ((unsigned long)(gre->limit - lgre)) >> (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ - gre->type, gre->nasid, gre->sockid, - gre->pnode, gre->pxm); + gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; if (sock_min > gre->sockid) @@ -1287,7 +1286,7 @@ static void __init build_socket_tables(void) _pnode_to_socket[i] = SOCK_EMPTY; /* fill in pnode/node/addr conversion list values */ - pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; @@ -1295,20 +1294,18 @@ static void __init build_socket_tables(void) if (_socket_to_pnode[i] != SOCK_EMPTY) continue; /* duplicate */ _socket_to_pnode[i] = gre->pnode; - _socket_to_node[i] = gre->pxm; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], - _socket_to_node[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* check socket -> node values */ + /* Set socket -> node values */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1319,14 +1316,9 @@ static void __init build_socket_tables(void) lnid = nid; apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - i = sockid - minsock; - - if (nid != _socket_to_node[i]) { - pr_warn( - "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", - i, sockid, gre->type, _socket_to_node[i], nid); - _socket_to_node[i] = nid; - } + _socket_to_node[sockid - minsock] = nid; + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", + sockid, apicid, nid); } /* Setup physical blade to pnode translation from GAM Range Table */ -- cgit v1.2.3 From 5a52e8f822374bebc702bb2688ed8b5515bbb55b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 1 Aug 2016 13:40:53 -0500 Subject: x86/platform/UV: Fix kernel panic running RHEL kdump kernel on UV systems The latest UV kernel support panics when RHEL7 kexec's the kdump kernel to make a dumpfile. This patch fixes the problem by turning off all UV support if NUMA is off. Tested-by: Frank Ramsay Tested-by: John Estabrook Signed-off-by: Mike Travis Reviewed-by: Dimitri Sivanich Reviewed-by: Nathan Zimmer Cc: Alex Thorlton Cc: Andrew Banman Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801184050.577755634@asylum.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 6aa0545879bb..cb0673c1e940 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + /* Setup early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; -- cgit v1.2.3 From 5e44258d168b2bdee51d9e2e1f1f4726ff9775cd Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Sun, 31 Jul 2016 23:24:50 -0400 Subject: x86/build: Reduce the W=1 warnings noise when compiling x86 syscall tables Building an X86_64 kernel with W=1 throws a total of 9,948 lines of warnings of this form for both 32-bit and 64-bit syscall tables. Given that the entire rest of the build for my config only generates 8,375 lines of output, this is a big reduction in the warnings generated. The warnings follow this pattern: ./arch/x86/include/generated/asm/syscalls_32.h:885:21: warning: initialized field overwritten [-Woverride-init] __SYSCALL_I386(379, compat_sys_pwritev2, ) ^ arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386' #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, ^~~ ./arch/x86/include/generated/asm/syscalls_32.h:885:21: note: (near initialization for 'ia32_sys_call_table[379]') __SYSCALL_I386(379, compat_sys_pwritev2, ) ^ arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386' #define __SYSCALL_I386(nr, sym, qual) [nr] = sym, Since we intentionally build the syscall tables this way, ignore that one warning in the two files. Signed-off-by: Valdis Kletnieks Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7464.1470021890@turing-police.cc.vt.edu Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index fe91c25092da..77f28ce9c646 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -5,6 +5,8 @@ OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y +CFLAGS_syscall_64.o += -Wno-override-init +CFLAGS_syscall_32.o += -Wno-override-init obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o -- cgit v1.2.3 From b79daf85899215d5ede3641806db2e2a77b776b4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 27 Jul 2016 16:20:40 -0700 Subject: x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation The Memory Protection Keys "rights register" (PKRU) is XSAVE-managed, and is saved/restored along with the FPU state. When kernel code accesses FPU regsisters, it does a delicate dance with preempt. Otherwise, the context switching code can get confused as to whether the most up-to-date state is in the registers themselves or in the XSAVE buffer. But, PKRU is not a normal FPU register. Using it does not generate the normal device-not-available (#NM) exceptions which means we can not manage it lazily, and the kernel completley disallows using lazy mode when it is enabled. The dance with preempt *only* occurs when managing the FPU lazily. Since we never manage PKRU lazily, we do not have to do the dance with preempt; we can access it directly. Doing it this way saves a ton of complicated code (and is faster too). Further, the XSAVES reenabling failed to patch a bit of code in fpu__xfeature_set_state() the checked for compacted buffers. That check caused fpu__xfeature_set_state() to silently refuse to work when the kernel is using compacted XSAVE buffers. This broke execute-only and future pkey_mprotect() support when using compact XSAVE buffers. But, removing fpu__xfeature_set_state() gets rid of this issue, in addition to the nice cleanup and speedup. This fixes the same thing as a fix that Sai posted: https://lkml.org/lkml/2016/7/25/637 The fix that he posted is a much more obviously correct, but I think we should just do this instead. Reported-by: Sai Praneeth Prakhya Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi Shankar Cc: Thomas Gleixner Cc: Yu-Cheng Yu Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 138 ++++++------------------------------------- 1 file changed, 17 insertions(+), 121 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 680049aa4593..01567aa87503 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } - -/* - * Set xfeatures (aka XSTATE_BV) bit for a feature that we want - * to take out of its "init state". This will ensure that an - * XRSTOR actually restores the state. - */ -static void fpu__xfeature_set_non_init(struct xregs_state *xsave, - int xstate_feature_mask) -{ - xsave->header.xfeatures |= xstate_feature_mask; -} - -/* - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) - * @xsave_state_ptr: a pointer to a copy of the state that you would - * like written in to the current task's FPU xsave state. This pointer - * must not be located in the current tasks's xsave area. - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. - */ -static void fpu__xfeature_set_state(int xstate_feature_mask, - void *xstate_feature_src, size_t len) -{ - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; - struct fpu *fpu = ¤t->thread.fpu; - void *dst; - - if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - WARN_ONCE(1, "%s() attempted with no xsave support", __func__); - return; - } - - /* - * Tell the FPU code that we need the FPU state to be in - * 'fpu' (not in the registers), and that we need it to - * be stable while we write to it. - */ - fpu__current_fpstate_write_begin(); - - /* - * This method *WILL* *NOT* work for compact-format - * buffers. If the 'xstate_feature_mask' is unset in - * xcomp_bv then we may need to move other feature state - * "up" in the buffer. - */ - if (xsave->header.xcomp_bv & xstate_feature_mask) { - WARN_ON_ONCE(1); - goto out; - } - - /* find the location in the xsave buffer of the desired state */ - dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); - - /* - * Make sure that the pointer being passed in did not - * come from the xsave buffer itself. - */ - WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); - - /* put the caller-provided data in the location */ - memcpy(dst, xstate_feature_src, len); - - /* - * Mark the xfeature so that the CPU knows there is state - * in the buffer now. - */ - fpu__xfeature_set_non_init(xsave, xstate_feature_mask); -out: - /* - * We are done writing to the 'fpu'. Reenable preeption - * and (possibly) move the fpstate back in to the fpregs. - */ - fpu__current_fpstate_write_end(); -} - #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* - * This will go out and modify the XSAVE buffer so that PKRU is - * set to a particular state for access to 'pkey'. - * - * PKRU state does affect kernel access to user memory. We do - * not modfiy PKRU *itself* here, only the XSAVE state that will - * be restored in to PKRU when we return back to userspace. + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct pkru_state *old_pkru_state; - struct pkru_state new_pkru_state; + u32 old_pkru; int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); u32 new_pkru_bits = 0; @@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; + /* + * For most XSAVE components, this would be an arduous task: + * brining fpstate up to date with fpregs, updating fpstate, + * then re-populating fpregs. But, for components that are + * never lazily managed, we can just access the fpregs + * directly. PKRU is never managed lazily, so we can just + * manipulate it directly. Make sure it stays that way. + */ + WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) @@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, /* Shift the bits in to the correct place in PKRU for pkey: */ new_pkru_bits <<= pkey_shift; - /* Locate old copy of the state in the xsave buffer: */ - old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); - - /* - * When state is not in the buffer, it is in the init - * state, set it manually. Otherwise, copy out the old - * state. - */ - if (!old_pkru_state) - new_pkru_state.pkru = 0; - else - new_pkru_state.pkru = old_pkru_state->pkru; - - /* Mask off any old bits in place: */ - new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - - /* Set the newly-requested bits: */ - new_pkru_state.pkru |= new_pkru_bits; - - /* - * We could theoretically live without zeroing pkru.pad. - * The current XSAVE feature state definition says that - * only bytes 0->3 are used. But we do not want to - * chance leaking kernel stack out to userspace in case a - * memcpy() of the whole xsave buffer was done. - * - * They're in the same cacheline anyway. - */ - new_pkru_state.pad = 0; + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); return 0; } -- cgit v1.2.3 From 62d16b5a3fca4d186e13215e0d7d2f6d36191796 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sat, 6 Aug 2016 12:20:39 +0200 Subject: x86/mm/kaslr: Fix -Wformat-security warning debug_putstr() is used to output strings without using printf-like formatting but debug_putstr(v) is defined as early_printk(v) in arch/x86/lib/kaslr.c. This makes clang reports the following warning when building with -Wformat-security: arch/x86/lib/kaslr.c:57:15: warning: format string is not a string literal (potentially insecure) [-Wformat-security] debug_putstr(purpose); ^~~~~~~ Fix this by using "%s" in early_printk(). Signed-off-by: Nicolas Iooss Acked-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160806102039.27221-1-nicolas.iooss_linux@m4x.org Signed-off-by: Ingo Molnar --- arch/x86/lib/kaslr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index f7dfeda83e5c..121f59c6ee54 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -19,7 +19,7 @@ #include #include -#define debug_putstr(v) early_printk(v) +#define debug_putstr(v) early_printk("%s", v) #define has_cpuflag(f) boot_cpu_has(f) #define get_boot_seed() kaslr_offset() #endif -- cgit v1.2.3 From ace7fab7a6cdd363a615ec537f2aa94dbc761ee2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Aug 2016 10:23:25 -0700 Subject: x86/mm: Fix swap entry comment and macro A recent patch changed the format of a swap PTE. The comment explaining the format of the swap PTE is wrong about the bits used for the swap type field. Amusingly, the ASCII art and the patch description are correct, but the comment itself is wrong. As I was looking at this, I also noticed that the SWP_OFFSET_FIRST_BIT has an off-by-one error. This does not really hurt anything. It just wasted a bit of space in the PTE, giving us 2^59 bytes of addressable space in our swapfiles instead of 2^60. But, it doesn't match with the comments, and it wastes a bit of space, so fix it. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Fixes: 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work around erratum") Link: http://lkml.kernel.org/r/20160810172325.E56AD7DA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 7e8ec7ae10fa..1cc82ece9ac1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names - * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 /* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -- cgit v1.2.3 From 82ba4faca1bffad429f15c90c980ffd010366c25 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Aug 2016 15:44:30 +0800 Subject: x86/irq: Do not substract irq_tlb_count from irq_call_count Since commit: 52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR") the TLB remote shootdown is done through call function vector. That commit didn't take care of irq_tlb_count, which a later commit: fd0f5869724f ("x86: Distinguish TLB shootdown interrupts from other functions call interrupts") ... tried to fix. The fix assumes every increase of irq_tlb_count has a corresponding increase of irq_call_count. So the irq_call_count is always bigger than irq_tlb_count and we could substract irq_tlb_count from irq_call_count. Unfortunately this is not true for the smp_call_function_single() case. The IPI is only sent if the target CPU's call_single_queue is empty when adding a csd into it in generic_exec_single. That means if two threads are both adding flush tlb csds to the same CPU's call_single_queue, only one IPI is sent. In other words, the irq_call_count is incremented by 1 but irq_tlb_count is incremented by 2. Over time, irq_tlb_count will be bigger than irq_call_count and the substract will produce a very large irq_call_count value due to overflow. Considering that: 1) it's not worth to send more IPIs for the sake of accurate counting of irq_call_count in generic_exec_single(); 2) it's not easy to tell if the call function interrupt is for TLB shootdown in __smp_call_function_single_interrupt(). Not to exclude TLB shootdown from call function count seems to be the simplest fix and this patch just does that. This bug was found by LKP's cyclic performance regression tracking recently with the vm-scalability test suite. I have bisected to commit: 3dec0ba0be6a ("mm/rmap: share the i_mmap_rwsem") This commit didn't do anything wrong but revealed the irq_call_count problem. IIUC, the commit makes rwc->remap_one in rmap_walk_file concurrent with multiple threads. When remap_one is try_to_unmap_one(), then multiple threads could queue flush TLB to the same CPU but only one IPI will be sent. Since the commit was added in Linux v3.19, the counting problem only shows up from v3.19 onwards. Signed-off-by: Aaron Lu Cc: Alex Shi Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Ying Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tomoki Sekiyama Link: http://lkml.kernel.org/r/20160811074430.GA18163@aaronlu.sh.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq.h | 4 ---- arch/x86/kernel/irq.c | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 7178043b0e1d..59405a248fc2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,10 +22,6 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - /* - * irq_tlb_count is double-counted in irq_call_count, so it must be - * subtracted from irq_call_count when displaying irq_call_count - */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 61521dc19c10..9f669fdd2010 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - - irq_stats(j)->irq_tlb_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) -- cgit v1.2.3 From 007b756053386af079ba963a8f5817ac651c7c59 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:13 -0700 Subject: x86/boot: Run reserve_bios_regions() after we initialize the memory map reserve_bios_regions() is a quirk that reserves memory that we might otherwise think is available. There's no need to run it so early, and running it before we have the memory map initialized with its non-quirky inputs makes it hard to make reserve_bios_regions() more intelligent. Move it right after we populate the memblock state. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/59f58618911005c799c6c9979ce6ae4881d907c2.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head32.c | 2 -- arch/x86/kernel/head64.c | 1 - arch/x86/kernel/setup.c | 2 ++ 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2dda0bc4576e..f16c55bfc090 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void) /* Initialize 32bit specific setup functions */ x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - - reserve_bios_regions(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 99d48e7d2974..54a2372f5dbb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_bios_regions(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95cf31c9f4ec..bf780e0f76ef 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1101,6 +1101,8 @@ void __init setup_arch(char **cmdline_p) efi_find_mirror(); } + reserve_bios_regions(); + /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. -- cgit v1.2.3 From 18bc7bd523e0fc5be8d76bf84bde733a97a8c375 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:14 -0700 Subject: x86/boot: Synchronize trampoline_cr4_features and mmu_cr4_features directly The initialization process for trampoline_cr4_features and mmu_cr4_features was confusing. The intent is for mmu_cr4_features and *trampoline_cr4_features to stay in sync, but trampoline_cr4_features is NULL until setup_real_mode() runs. The old code synchronized *trampoline_cr4_features *twice*, once in setup_real_mode() and once in setup_arch(). It also initialized mmu_cr4_features in setup_real_mode(), which causes the actual value of mmu_cr4_features to potentially depend on when setup_real_mode() is called. With this patch, mmu_cr4_features is initialized directly in setup_arch(), and *trampoline_cr4_features is synchronized to mmu_cr4_features when the trampoline is set up. After this patch, it should be safe to defer setup_real_mode(). Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d48a263f9912389b957dd495a7127b009259ffe0.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 17 ++++++++++------- arch/x86/realmode/init.c | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bf780e0f76ef..95c8c9c7dd6d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1131,6 +1131,16 @@ void __init setup_arch(char **cmdline_p) early_trap_pf_init(); + /* + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. + */ + if (boot_cpu_data.cpuid_level >= 0) + /* A CPU has %cr4 if and only if it has CPUID. */ + mmu_cr4_features = __read_cr4(); + setup_real_mode(); memblock_set_current_limit(get_max_mapped()); @@ -1180,13 +1190,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); - if (boot_cpu_data.cpuid_level >= 0) { - /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = __read_cr4(); - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - } - #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 705e3fffb4a1..c5bdc4e473e7 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -4,6 +4,7 @@ #include #include #include +#include struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -84,7 +85,7 @@ void __init setup_real_mode(void) trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = __read_cr4(); + *trampoline_cr4_features = mmu_cr4_features; trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; -- cgit v1.2.3 From d0de0f685db7faf2ae4597a39a59996dd84e18c7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:15 -0700 Subject: x86/boot: Defer setup_real_mode() to early_initcall time There's no need to run setup_real_mode() as early as we run it. Defer it to the same early_initcall that sets up the page permissions for the real mode code. This should be a code size reduction. More importantly, it give us a longer window in which we can allocate the real mode trampoline. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fd62f0da4f79357695e9bf3e365623736b05f119.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/realmode.h | 1 - arch/x86/kernel/setup.c | 2 -- arch/x86/realmode/init.c | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 9c6b890d5e7a..8d6777724ba4 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -59,6 +59,5 @@ extern unsigned char secondary_startup_64[]; #endif void reserve_real_mode(void); -void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95c8c9c7dd6d..0fa60f5f5a16 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1141,8 +1141,6 @@ void __init setup_arch(char **cmdline_p) /* A CPU has %cr4 if and only if it has CPUID. */ mmu_cr4_features = __read_cr4(); - setup_real_mode(); - memblock_set_current_limit(get_max_mapped()); /* diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index c5bdc4e473e7..747b71e8f547 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -30,7 +30,7 @@ void __init reserve_real_mode(void) base, (unsigned long long)mem, size); } -void __init setup_real_mode(void) +static void __init setup_real_mode(void) { u16 real_mode_seg; const u32 *rel; @@ -101,7 +101,7 @@ void __init setup_real_mode(void) * need to mark it executable at do_pre_smp_initcalls() at least, * thus run it as a early_initcall(). */ -static int __init set_real_mode_permissions(void) +static void __init set_real_mode_permissions(void) { unsigned char *base = (unsigned char *) real_mode_header; size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); @@ -120,7 +120,16 @@ static int __init set_real_mode_permissions(void) set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); +} + +static int __init init_real_mode(void) +{ + if (!real_mode_header) + panic("Real mode trampoline was not allocated"); + + setup_real_mode(); + set_real_mode_permissions(); return 0; } -early_initcall(set_real_mode_permissions); +early_initcall(init_real_mode); -- cgit v1.2.3 From 5ff3e2c3c3eebe13967d81ad1f23b9468fefea81 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:16 -0700 Subject: x86/boot: Rework reserve_real_mode() to allow multiple tries If reserve_real_mode() fails, panicing immediately means we're doomed. Make it safe to try more than once to allocate the trampoline: - Degrade a failure from panic() to pr_info(). (If we make it to setup_real_mode() without reserving the trampoline, we'll panic them.) - Factor out helpers so that platform code can supply a specific address to try. - Warn if reserve_real_mode() is called after we're done with the memblock allocator. If that were to happen, we would behave unpredictably. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/876e383038f3e9971aa72fd20a4f5da05f9d193d.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/realmode.h | 9 +++++++++ arch/x86/realmode/init.c | 29 +++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 8d6777724ba4..b2988c0ed829 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,6 +58,15 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif +static inline size_t real_mode_size_needed(void) +{ + if (real_mode_header) + return 0; /* already allocated. */ + + return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE); +} + +void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 747b71e8f547..5db706f14111 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -12,22 +13,34 @@ u32 *trampoline_cr4_features; /* Hold the pgd entry used on booting additional CPUs */ pgd_t trampoline_pgd_entry; +void __init set_real_mode_mem(phys_addr_t mem, size_t size) +{ + void *base = __va(mem); + + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); +} + void __init reserve_real_mode(void) { phys_addr_t mem; - unsigned char *base; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + size_t size = real_mode_size_needed(); + + if (!size) + return; + + WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); + if (!mem) { + pr_info("No sub-1M memory is available for the trampoline\n"); + return; + } - base = __va(mem); memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); + set_real_mode_mem(mem, size); } static void __init setup_real_mode(void) -- cgit v1.2.3 From 5bc653b7318217c54244a14f248f1f07abe0a865 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Aug 2016 02:29:17 -0700 Subject: x86/efi: Allocate a trampoline if needed in efi_free_boot_services() On my Dell XPS 13 9350 with firmware 1.4.4 and SGX on, if I boot Fedora 24's grub2-efi off a hard disk, my first 1MB of RAM looks like: efi: mem00: [Runtime Data |RUN| | | | | | | |WB|WT|WC|UC] range=[0x0000000000000000-0x0000000000000fff] (0MB) efi: mem01: [Boot Data | | | | | | | | |WB|WT|WC|UC] range=[0x0000000000001000-0x0000000000027fff] (0MB) efi: mem02: [Loader Data | | | | | | | | |WB|WT|WC|UC] range=[0x0000000000028000-0x0000000000029fff] (0MB) efi: mem03: [Reserved | | | | | | | | |WB|WT|WC|UC] range=[0x000000000002a000-0x000000000002bfff] (0MB) efi: mem04: [Runtime Data |RUN| | | | | | | |WB|WT|WC|UC] range=[0x000000000002c000-0x000000000002cfff] (0MB) efi: mem05: [Loader Data | | | | | | | | |WB|WT|WC|UC] range=[0x000000000002d000-0x000000000002dfff] (0MB) efi: mem06: [Conventional Memory| | | | | | | | |WB|WT|WC|UC] range=[0x000000000002e000-0x0000000000057fff] (0MB) efi: mem07: [Reserved | | | | | | | | |WB|WT|WC|UC] range=[0x0000000000058000-0x0000000000058fff] (0MB) efi: mem08: [Conventional Memory| | | | | | | | |WB|WT|WC|UC] range=[0x0000000000059000-0x000000000009ffff] (0MB) My EBDA is at 0x2c000, which blocks off everything from 0x2c000 and up, and my trampoline is 0x6000 bytes (6 pages), so it doesn't fit in the loader data range at 0x28000. Without this patch, it panics due to a failure to allocate the trampoline. With this patch, it works: [ +0.001744] Base memory trampoline at [ffff880000001000] 1000 size 24576 Signed-off-by: Andy Lutomirski Reviewed-by: Matt Fleming Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mario Limonciello Cc: Matt Fleming Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/998c77b3bf709f3dfed85cb30701ed1a5d8a438b.1470821230.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/quirks.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 4480c06cade7..89d1146f5a6f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -254,6 +254,7 @@ void __init efi_free_boot_services(void) for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + size_t rm_size; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) @@ -263,6 +264,26 @@ void __init efi_free_boot_services(void) if (md->attribute & EFI_MEMORY_RUNTIME) continue; + /* + * Nasty quirk: if all sub-1MB memory is used for boot + * services, we can get here without having allocated the + * real mode trampoline. It's too late to hand boot services + * memory back to the memblock allocator, so instead + * try to manually allocate the trampoline if needed. + * + * I've seen this on a Dell XPS 13 9350 with firmware + * 1.4.4 with SGX enabled booting Linux via Fedora 24's + * grub2-efi on a hard disk. (And no, I don't know why + * this happened, but Linux should still try to boot rather + * panicing early.) + */ + rm_size = real_mode_size_needed(); + if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { + set_real_mode_mem(start, rm_size); + start += rm_size; + size -= rm_size; + } + free_bootmem_late(start, size); } -- cgit v1.2.3 From f72075c9eda8a43aeea2f9dbb8d187afd4a76f0b Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Thu, 11 Aug 2016 11:41:59 +0100 Subject: x86/platform/uv: Skip UV runtime services mapping in the efi_runtime_disabled case This problem has actually been in the UV code for a while, but we didn't catch it until recently, because we had been relying on EFI_OLD_MEMMAP to allow our systems to boot for a period of time. We noticed the issue when trying to kexec a recent community kernel, where we hit this NULL pointer dereference in efi_sync_low_kernel_mappings(): [ 0.337515] BUG: unable to handle kernel NULL pointer dereference at 0000000000000880 [ 0.346276] IP: [] efi_sync_low_kernel_mappings+0x5d/0x1b0 The problem doesn't show up with EFI_OLD_MEMMAP because we skip the chunk of setup_efi_state() that sets the efi_loader_signature for the kexec'd kernel. When the kexec'd kernel boots, it won't set EFI_BOOT in setup_arch, so we completely avoid the bug. We always kexec with noefi on the command line, so this shouldn't be an issue, but since we're not actually checking for efi_runtime_disabled in uv_bios_init(), we end up trying to do EFI runtime callbacks when we shouldn't be. This patch just adds a check for efi_runtime_disabled in uv_bios_init() so that we don't map in uv_systab when runtime_disabled == true. Signed-off-by: Alex Thorlton Signed-off-by: Matt Fleming Cc: # v4.7 Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1470912120-22831-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/bios_uv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 66b2166ea4a1..0df8a0370d32 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -187,7 +187,8 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); void uv_bios_init(void) { uv_systab = NULL; - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) { + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + !efi.uv_systab || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); return; } -- cgit v1.2.3 From d52c0569bab4edc888832df44dc7ac28517134f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 11 Aug 2016 16:08:35 +0200 Subject: x86/apic/x2apic, smp/hotplug: Don't use before alloc in x2apic_cluster_probe() I made a mistake while converting the driver to the hotplug state machine and as a result x2apic_cluster_probe() was accessing cpus_in_cluster before allocating it. This patch fixes it by setting the cpumask after the allocation the memory succeeded. While at it, I marked two functions static which are only used within this file. Reported-by: Laura Abbott Signed-off-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6b2c28471de5 ("x86/x2apic: Convert to CPU hotplug state machine") Link: http://lkml.kernel.org/r/1470924515-9444-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6368fa69d2af..54f35d988025 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -155,7 +155,7 @@ static void init_x2apic_ldr(void) /* * At CPU state changes, update the x2apic cluster sibling info. */ -int x2apic_prepare_cpu(unsigned int cpu) +static int x2apic_prepare_cpu(unsigned int cpu) { if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) return -ENOMEM; @@ -168,7 +168,7 @@ int x2apic_prepare_cpu(unsigned int cpu) return 0; } -int x2apic_dead_cpu(unsigned int this_cpu) +static int x2apic_dead_cpu(unsigned int this_cpu) { int cpu; @@ -186,13 +186,18 @@ int x2apic_dead_cpu(unsigned int this_cpu) static int x2apic_cluster_probe(void) { int cpu = smp_processor_id(); + int ret; if (!x2apic_mode) return 0; + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + x2apic_prepare_cpu, x2apic_dead_cpu); + if (ret < 0) { + pr_err("Failed to register X2APIC_PREPARE\n"); + return 0; + } cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); - cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", - x2apic_prepare_cpu, x2apic_dead_cpu); return 1; } -- cgit v1.2.3 From 68187872c76a96ed4db7bfb064272591f02e208b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 11 Aug 2016 17:45:21 +0200 Subject: uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions Since instruction decoder now supports EVEX-encoded instructions, two fixes are needed to correctly handle them in uprobes. Extended bits for MODRM.rm field need to be sanitized just like we do it for VEX3, to avoid encoding wrong register for register-relative access. EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be ignored by the CPU (since GPRs go only up to 15, not 31), but let's be paranoid here: proper encoding for register-relative access should have EVEX.x = 1. Secondly, we should fetch vex.vvvv for EVEX too. This is now super easy because instruction decoder populates vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for VEX2. Signed-off-by: Denys Vlasenko Acked-by: Masami Hiramatsu Acked-by: Srikar Dronamraju Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Jim Keniston Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Cc: # v4.1+ Fixes: 8a764a875fe3 ("x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX") Link: http://lkml.kernel.org/r/20160811154521.20469-1-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31d99ff..495c776de4b4 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* - * Similar treatment for VEX3 prefix. - * TODO: add XOP/EVEX treatment when insn decoder supports them + * Similar treatment for VEX3/EVEX prefix. + * TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa - * (evex will need setting of both b and x since - * in non-sib encoding evex.x is 4th bit of MODRM.rm) - * Setting VEX3.b (setting because it has inverted meaning): + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* - * TODO: add XOP, EXEV vvvv reading. + * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. -- cgit v1.2.3 From 10e9e7bd598f9a66a11a22514c68c13c41fc821b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 11 Aug 2016 07:30:20 -0700 Subject: perf/x86/intel/uncore: Fix uncore num_counters Some uncore boxes' num_counters value for Haswell server and Broadwell server are not correct (too large, off by one). This issue was found by comparing the code with the document. Although there is no bug report from users yet, accessing non-existent counters is dangerous and the behavior is undefined: it may cause miscounting or even crashes. This patch makes them consistent with the uncore document. Reported-by: Lukasz Odzioba Signed-off-by: Kan Liang Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Link: http://lkml.kernel.org/r/1470925820-59847-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 824e54086e07..8aee83bcf71f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2626,7 +2626,7 @@ void hswep_uncore_cpu_init(void) static struct intel_uncore_type hswep_uncore_ha = { .name = "ha", - .num_counters = 5, + .num_counters = 4, .num_boxes = 2, .perf_ctr_bits = 48, SNBEP_UNCORE_PCI_COMMON_INIT(), @@ -2645,7 +2645,7 @@ static struct uncore_event_desc hswep_uncore_imc_events[] = { static struct intel_uncore_type hswep_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, @@ -2691,7 +2691,7 @@ static struct intel_uncore_type hswep_uncore_irp = { static struct intel_uncore_type hswep_uncore_qpi = { .name = "qpi", - .num_counters = 5, + .num_counters = 4, .num_boxes = 3, .perf_ctr_bits = 48, .perf_ctr = SNBEP_PCI_PMON_CTR0, @@ -2773,7 +2773,7 @@ static struct event_constraint hswep_uncore_r3qpi_constraints[] = { static struct intel_uncore_type hswep_uncore_r3qpi = { .name = "r3qpi", - .num_counters = 4, + .num_counters = 3, .num_boxes = 3, .perf_ctr_bits = 44, .constraints = hswep_uncore_r3qpi_constraints, @@ -2972,7 +2972,7 @@ static struct intel_uncore_type bdx_uncore_ha = { static struct intel_uncore_type bdx_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, -- cgit v1.2.3 From 95f3be798472f63b495ca4712af005ea5ac7aa47 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 11 Aug 2016 07:31:14 -0700 Subject: perf/x86/intel/uncore: Add enable_box for client MSR uncore There are bug reports about miscounting uncore counters on some client machines like Sandybridge, Broadwell and Skylake. It is very likely to be observed on idle systems. This issue is caused by a hardware issue. PERF_GLOBAL_CTL could be cleared after Package C7, and nothing will be count. The related errata (HSD 158) could be found in: www.intel.com/content/dam/www/public/us/en/documents/specification-updates/4th-gen-core-family-desktop-specification-update.pdf This patch tries to work around this issue by re-enabling PERF_GLOBAL_CTL in ->enable_box(). The workaround does not cover all cases. It helps for new events after returning from C7. But it cannot prevent C7, it will still miscount if a counter is already active. There is no drawback in leaving it enabled, so it does not need disable_box() here. Signed-off-by: Kan Liang Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470925874-59943-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snb.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 97a69dbba649..9d35ec0cb8fc 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -100,6 +100,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); +} + static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -127,6 +133,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .enable_box = snb_uncore_msr_enable_box, .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, @@ -192,6 +199,12 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); +} + static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -200,6 +213,7 @@ static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) static struct intel_uncore_ops skl_uncore_msr_ops = { .init_box = skl_uncore_msr_init_box, + .enable_box = skl_uncore_msr_enable_box, .exit_box = skl_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, -- cgit v1.2.3 From 5d87f493ddb1b86a0569fa3c4037fa9efc0c7183 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 14 Aug 2016 04:07:32 +0200 Subject: x86/power/64: Use __pa() for physical address computation The value of temp_level4_pgt is the physical address of the top-level page directory, so use __pa() to compute it. Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- arch/x86/power/hibernate_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a3e3ccc87138..9634557a5444 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -113,7 +113,7 @@ static int set_up_temporary_mappings(void) return result; } - temp_level4_pgt = (unsigned long)pgd - __PAGE_OFFSET; + temp_level4_pgt = __pa(pgd); return 0; } -- cgit v1.2.3 From 172b1d6b5a9337eb8c1ec294b80e448e03a9ac17 Mon Sep 17 00:00:00 2001 From: Xiaodong Liu Date: Fri, 12 Aug 2016 06:24:42 -0400 Subject: crypto: sha256-mb - fix ctx pointer and digest copy 1. fix ctx pointer Use req_ctx which is the ctx for the next job that have been completed in the lanes instead of the first completed job rctx, whose completion could have been called and released. 2. fix digest copy Use XMM register to copy another 16 bytes sha256 digest instead of a regular register. Signed-off-by: Xiaodong Liu Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-mb/sha256_mb.c | 4 ++-- arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 89fa85e8b10c..6f97fb33ae21 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -485,10 +485,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); else { local_bh_disable(); - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); local_bh_enable(); } } diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index b691da981cd9..a78a0694ddef 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -265,13 +265,14 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 - movl _args_digest+4*32(state, idx, 4), tmp2_w + vmovd _args_digest(state , idx, 4) , %xmm0 vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1 - vmovdqu %xmm0, _result_digest(job_rax) - movl tmp2_w, _result_digest+1*16(job_rax) + vmovdqu %xmm0, _result_digest(job_rax) + offset = (_result_digest + 1*16) + vmovdqu %xmm1, offset(job_rax) pop %rbx -- cgit v1.2.3 From e67479b13ede47cc2f5beb5b51e67fdb30778ee8 Mon Sep 17 00:00:00 2001 From: Xiaodong Liu Date: Fri, 12 Aug 2016 06:28:31 -0400 Subject: crypto: sha512-mb - fix ctx pointer 1. fix ctx pointer Use req_ctx which is the ctx for the next job that have been completed in the lanes instead of the first completed job rctx, whose completion could have been called and released. Signed-off-by: Xiaodong Liu Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-mb/sha512_mb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index f4cf5b78fd36..d210174a52b0 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -497,10 +497,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); else { local_bh_disable(); - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); local_bh_enable(); } } -- cgit v1.2.3 From 88b2f634028f1f38dcc3d412e10ff1f224976daa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 17 Aug 2016 13:33:14 +0200 Subject: x86/microcode/AMD: Fix initrd loading with CONFIG_RANDOMIZE_MEMORY=y Similar to: efaad554b4ff ("x86/microcode/intel: Fix initrd loading with CONFIG_RANDOMIZE_MEMORY=y") ... fix microcode loading from the initrd on AMD by adding the randomization offset to the microcode patch container within the initrd. Reported-and-tested-by: Brian Gerst Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-tip-commits@vger.kernel.org Link: http://lkml.kernel.org/r/20160817113314.GA19221@nazgul.tnic Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 27a0228c9cae..b816971f5da4 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -355,6 +355,7 @@ void load_ucode_amd_ap(void) unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; + u8 *cont = container; u32 rev, eax; u16 eq_id; @@ -371,8 +372,11 @@ void load_ucode_amd_ap(void) if (check_current_patch_level(&rev, false)) return; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -434,6 +438,9 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -- cgit v1.2.3 From 7b0501b1e7cddd32b265178e32d332bdfbb532d4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Aug 2016 12:17:00 +0200 Subject: x86/smp: Fix __max_logical_packages value setup Frank reported kernel panic when he disabled several cores in BIOS via following option: Core Disable Bitmap(Hex) [0] with number 0xFFE, which leaves 16 CPUs in system (out of 48). The kernel panic below goes along with following messages: smpboot: Max logical packages: 2^M smpboot: APIC(0) Converting physical 0 to logical package 0^M smpboot: APIC(20) Converting physical 1 to logical package 1^M smpboot: APIC(40) Package 2 exceeds logical package map^M smpboot: CPU 8 APICId 40 disabled^M smpboot: APIC(60) Package 3 exceeds logical package map^M smpboot: CPU 12 APICId 60 disabled^M ... general protection fault: 0000 [#1] SMP^M Modules linked in:^M CPU: 15 PID: 1 Comm: swapper/0 Not tainted 4.7.0-rc5+ #1^M Hardware name: SGI UV300/UV300, BIOS SGI UV 300 series BIOS 05/25/2016^M task: ffff8801673e0000 ti: ffff8801673ac000 task.ti: ffff8801673ac000^M RIP: 0010:[] [] uncore_change_context+0xd4/0x180^M ... [] uncore_event_init_cpu+0x6c/0x70^M [] intel_uncore_init+0x1c2/0x2dd^M [] ? uncore_cpu_setup+0x17/0x17^M [] do_one_initcall+0x50/0x190^M [] ? parse_args+0x293/0x480^M [] kernel_init_freeable+0x1a5/0x249^M [] ? set_debug_rodata+0x12/0x12^M [] kernel_init+0xe/0x110^M [] ret_from_fork+0x1f/0x40^M [] ? rest_init+0x80/0x80^M The reason for the panic is wrong value of __max_logical_packages, which lets logical_package_map uninitialized and the uncore code relying on this map being properly initialized (maybe we should add some safety checks there as well). The __max_logical_packages is computed as: DIV_ROUND_UP(total_cpus, ncpus); - ncpus being number of cores With above BIOS setup we get total_cpus == 16 which set __max_logical_packages to 2 (ncpus is 12). Once topology_update_package_map processes CPU with logical pkg over 2 we display above messages and fail to initialize the physical_to_logical_pkg map, which makes the uncore code crash. The fix is to remove logical_package_map bitmap completely and keep and update the logical_packages number instead. After we enumerate all the present CPUs, we check if the enumerated logical packages count is within its computed maximum from BIOS data. If it's not the case, we set this maximum to the new enumerated value and freeze any new addition of logical packages. The freeze is because lot of init code like uncore/rapl/cqm depends on having maximum logical package value set to allocate their data, so we can't change it later on. Prarit Bhargava tested the patch and confirms that it solves the problem: From dmidecode: Core Count: 24 Core Enabled: 24 Thread Count: 48 Orig kernel boot log: [ 0.464981] smpboot: Max logical packages: 19 [ 0.469861] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.477261] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.484760] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.492258] smpboot: APIC(c0) Converting physical 3 to logical package 3 1. nr_cpus=8, should stop enumerating in package 0: [ 0.533664] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.539596] smpboot: Max logical packages: 19 2. max_cpus=8, should still enumerate all packages: [ 0.526494] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.532428] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.538456] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.544486] smpboot: APIC(c0) Converting physical 3 to logical package 3 [ 0.550524] smpboot: Max logical packages: 19 3. nr_cpus=49 ( 2 socket + 1 core on 3rd socket), should stop enumerating in package 2: [ 0.521378] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.527314] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.533345] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.539368] smpboot: Max logical packages: 19 4. maxcpus=49, should still enumerate all packages: [ 0.525591] smpboot: APIC(0) Converting physical 0 to logical package 0 [ 0.531525] smpboot: APIC(40) Converting physical 1 to logical package 1 [ 0.537547] smpboot: APIC(80) Converting physical 2 to logical package 2 [ 0.543579] smpboot: APIC(c0) Converting physical 3 to logical package 3 [ 0.549624] smpboot: Max logical packages: 19 5. kdump (nr_cpus=1) works as well. Reported-by: Frank Ramsay Tested-by: Prarit Bhargava Signed-off-by: Jiri Olsa Reviewed-by: Prarit Bhargava Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160815101700.GA30090@krava Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a6e84a30a54..4296beb8fdd3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -100,10 +100,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. We might want to allocate that dynamically */ static int *physical_to_logical_pkg __read_mostly; static unsigned long *physical_package_map __read_mostly;; -static unsigned long *logical_package_map __read_mostly; static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; @@ -277,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - new = find_first_zero_bit(logical_package_map, __max_logical_packages); - if (new >= __max_logical_packages) { + if (logical_packages_frozen) { physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package map\n", + pr_warn("APIC(%x) Package %u exceeds logical package max\n", apicid, pkg); return -ENOSPC; } - set_bit(new, logical_package_map); + + new = logical_packages++; pr_info("APIC(%x) Converting physical %u to logical package %u\n", apicid, pkg, new); physical_to_logical_pkg[pkg] = new; @@ -341,6 +342,7 @@ static void __init smp_init_package_map(void) } __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + logical_packages = 0; /* * Possibly larger than what we need as the number of apic ids per @@ -352,10 +354,6 @@ static void __init smp_init_package_map(void) memset(physical_to_logical_pkg, 0xff, size); size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); - logical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); for_each_present_cpu(cpu) { unsigned int apicid = apic->cpu_present_to_apicid(cpu); @@ -369,6 +367,15 @@ static void __init smp_init_package_map(void) set_cpu_possible(cpu, false); set_cpu_present(cpu, false); } + + if (logical_packages > __max_logical_packages) { + pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", + logical_packages, __max_logical_packages); + logical_packages_frozen = true; + __max_logical_packages = logical_packages; + } + + pr_info("Max logical packages: %u\n", __max_logical_packages); } void __init smp_store_boot_cpu_info(void) -- cgit v1.2.3 From d048c098218e91ed0e10dfa1f0f80e2567fe4ef7 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Mon, 8 Aug 2016 20:16:22 +0200 Subject: KVM: nVMX: fix msr bitmaps to prevent L2 from accessing L0 x2APIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit msr bitmap can be used to avoid a VM exit (interception) on guest MSR accesses. In some configurations of VMX controls, the guest can even directly access host's x2APIC MSRs. See SDM 29.5 VIRTUALIZING MSR-BASED APIC ACCESSES. L2 could read all L0's x2APIC MSRs and write TPR, EOI, and SELF_IPI. To do so, L1 would first trick KVM to disable all possible interceptions by enabling APICv features and then would turn those features off; nested_vmx_merge_msr_bitmap() only disabled interceptions, so VMX would not intercept previously enabled MSRs even though they were not safe with the new configuration. Correctly re-enabling interceptions is not enough as a second bug would still allow L1+L2 to access host's MSRs: msr bitmap was shared for all VMCSs, so L1 could trigger a race to get the desired combination of msr bitmap and VMX controls. This fix allocates a msr bitmap for every L1 VCPU, allows only safe x2APIC MSRs from L1's msr bitmap, and disables msr bitmaps if they would have to intercept everything anyway. Fixes: 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap") Reported-by: Jim Mattson Suggested-by: Wincy Van Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 107 ++++++++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a45d8580f91e..c66ac2c70d22 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -435,6 +435,8 @@ struct nested_vmx { bool pi_pending; u16 posted_intr_nv; + unsigned long *msr_bitmap; + struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -924,7 +926,6 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -2508,7 +2509,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) unsigned long *msr_bitmap; if (is_guest_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_nested; + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { @@ -6363,13 +6364,6 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_longmode_x2apic) goto out4; - if (nested) { - vmx_msr_bitmap_nested = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_nested) - goto out5; - } - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out6; @@ -6392,8 +6386,6 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (nested) - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; @@ -6529,9 +6521,6 @@ out8: out7: free_page((unsigned long)vmx_vmread_bitmap); out6: - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); -out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); @@ -6557,8 +6546,6 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6995,16 +6982,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_vmcs12) - return -ENOMEM; + goto out_cached_vmcs12; if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) { - kfree(vmx->nested.cached_vmcs12); - return -ENOMEM; - } + if (!shadow_vmcs) + goto out_shadow_vmcs; /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -7024,6 +7016,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); return 1; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; } /* @@ -7098,6 +7099,10 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); + if (vmx->nested.msr_bitmap) { + free_page((unsigned long)vmx->nested.msr_bitmap); + vmx->nested.msr_bitmap = NULL; + } if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); kfree(vmx->nested.cached_vmcs12); @@ -9472,8 +9477,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, { int msr; struct page *page; - unsigned long *msr_bitmap; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -9482,63 +9489,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, WARN_ON(1); return false; } - msr_bitmap = (unsigned long *)kmap(page); - if (!msr_bitmap) { + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (!msr_bitmap_l1) { nested_release_page_clean(page); WARN_ON(1); return false; } + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) for (msr = 0x800; msr <= 0x8ff; msr++) nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, msr, MSR_TYPE_R); - /* TPR is allowed */ - nested_vmx_disable_intercept_for_msr(msr_bitmap, - vmx_msr_bitmap_nested, + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { - /* EOI and self-IPI are allowed */ nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_EOI >> 4), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_SELF_IPI >> 4), MSR_TYPE_W); } - } else { - /* - * Enable reading intercept of all the x2apic - * MSRs. We should not rely on vmcs12 to do any - * optimizations here, it may have been modified - * by L1. - */ - for (msr = 0x800; msr <= 0x8ff; msr++) - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - msr, - MSR_TYPE_R); - - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); } kunmap(page); nested_release_page_clean(page); @@ -9957,10 +9938,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS) { - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); - /* MSR_BITMAP will be set by following vmx_set_efer. */ - } else + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ + else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; /* -- cgit v1.2.3 From dccbfcf52cebb8963246eba5b177b77f26b34da0 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Mon, 8 Aug 2016 20:16:23 +0200 Subject: KVM: nVMX: postpone VMCS changes on MSR_IA32_APICBASE write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If vmcs12 does not intercept APIC_BASE writes, then KVM will handle the write with vmcs02 as the current VMCS. This will incorrectly apply modifications intended for vmcs01 to vmcs02 and L2 can use it to gain access to L0's x2APIC registers by disabling virtualized x2APIC while using msr bitmap that assumes enabled. Postpone execution of vmx_set_virtual_x2apic_mode until vmcs01 is the current VMCS. An alternative solution would temporarily make vmcs01 the current VMCS, but it requires more care. Fixes: 8d14695f9542 ("x86, apicv: add virtual x2apic support") Reported-by: Jim Mattson Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c66ac2c70d22..ae111a07acc4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -422,6 +422,7 @@ struct nested_vmx { struct list_head vmcs02_pool; int vmcs02_num; u64 vmcs01_tsc_offset; + bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; /* @@ -8424,6 +8425,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) { u32 sec_exec_control; + /* Postpone execution until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true; + return; + } + /* * There is not point to enable virtualize x2apic without enable * apicv @@ -10749,6 +10756,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); + if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { + vmx->nested.change_vmcs01_virtual_x2apic_mode = false; + vmx_set_virtual_x2apic_mode(vcpu, + vcpu->arch.apic_base & X2APIC_ENABLE); + } + /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; -- cgit v1.2.3 From c95ba92afb238ac565c68968fc72e38ca8d1b6e8 Mon Sep 17 00:00:00 2001 From: Peter Feiner Date: Wed, 17 Aug 2016 09:36:47 -0700 Subject: kvm: nVMX: fix nested tsc scaling When the host supported TSC scaling, L2 would use a TSC multiplier of 0, which causes a VM entry failure. Now L2's TSC uses the same multiplier as L1. Signed-off-by: Peter Feiner Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ae111a07acc4..5cede40e2552 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2200,6 +2200,12 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); } +static void decache_tsc_multiplier(struct vcpu_vmx *vmx) +{ + vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -2258,10 +2264,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Setup TSC multiplier */ if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { - vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); - } + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) + decache_tsc_multiplier(vmx); vmx_vcpu_pi_load(vcpu, cpu); vmx->host_pkru = read_pkru(); @@ -9999,6 +10003,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); else vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); if (enable_vpid) { /* @@ -10755,6 +10761,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, else vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { vmx->nested.change_vmcs01_virtual_x2apic_mode = false; -- cgit v1.2.3 From 21c80c9fefc3db10b530a96eb0478c29eb28bf77 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 23 Aug 2016 16:36:42 -0500 Subject: x86/PCI: VMD: Fix infinite loop executing irq's We can't initialize the list head on deletion as this causes the node to point to itself, which causes an infinite loop if vmd_irq() happens to be servicing that node. The list initialization was trying to fix a bug from multiple calls to disable the same IRQ. Fix this instead by having the VMD driver track if the interrupt is enabled. [bhelgaas: changelog, add "Fixes"] Fixes: 97e923063575 ("x86/PCI: VMD: Initialize list item in IRQ disable") Reported-by: Grzegorz Koczot Tested-by: Miroslaw Drost Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Acked-by Jon Derrick: --- arch/x86/pci/vmd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index b814ca675131..7948be342ee9 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -41,6 +41,7 @@ static DEFINE_RAW_SPINLOCK(list_lock); * @node: list item for parent traversal. * @rcu: RCU callback item for freeing. * @irq: back pointer to parent. + * @enabled: true if driver enabled IRQ * @virq: the virtual IRQ value provided to the requesting driver. * * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to @@ -50,6 +51,7 @@ struct vmd_irq { struct list_head node; struct rcu_head rcu; struct vmd_irq_list *irq; + bool enabled; unsigned int virq; }; @@ -122,7 +124,9 @@ static void vmd_irq_enable(struct irq_data *data) unsigned long flags; raw_spin_lock_irqsave(&list_lock, flags); + WARN_ON(vmdirq->enabled); list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list); + vmdirq->enabled = true; raw_spin_unlock_irqrestore(&list_lock, flags); data->chip->irq_unmask(data); @@ -136,8 +140,10 @@ static void vmd_irq_disable(struct irq_data *data) data->chip->irq_mask(data); raw_spin_lock_irqsave(&list_lock, flags); - list_del_rcu(&vmdirq->node); - INIT_LIST_HEAD_RCU(&vmdirq->node); + if (vmdirq->enabled) { + list_del_rcu(&vmdirq->node); + vmdirq->enabled = false; + } raw_spin_unlock_irqrestore(&list_lock, flags); } -- cgit v1.2.3 From 2e63ad4bd5dd583871e6602f9d398b9322d358d9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 23 Aug 2016 20:07:19 +0800 Subject: x86/apic: Do not init irq remapping if ioapic is disabled native_smp_prepare_cpus -> default_setup_apic_routing -> enable_IR_x2apic -> irq_remapping_prepare -> intel_prepare_irq_remapping -> intel_setup_irq_remapping So IR table is setup even if "noapic" boot parameter is added. As a result we crash later when the interrupt affinity is set due to a half initialized remapping infrastructure. Prevent remap initialization when IOAPIC is disabled. Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1471954039-3942-1-git-send-email-wanpeng.li@hotmail.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index cea4fc19e844..50c95af0f017 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1623,6 +1623,9 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; + if (skip_ioapic_setup) + return; + ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) return; -- cgit v1.2.3 From 55467dea2967259f21f4f854fc99d39cc5fea60e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 29 Jul 2016 11:06:48 +0200 Subject: xen: change the type of xen_vcpu_id to uint32_t We pass xen_vcpu_id mapping information to hypercalls which require uint32_t type so it would be cleaner to have it as uint32_t. The initializer to -1 can be dropped as we always do the mapping before using it and we never check the 'not set' value anyway. Signed-off-by: Vitaly Kuznetsov Signed-off-by: David Vrabel --- arch/arm/xen/enlighten.c | 2 +- arch/x86/xen/enlighten.c | 2 +- include/xen/xen-ops.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index b0b82f5ea338..3d2cef6488ea 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -50,7 +50,7 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); static struct vcpu_info __percpu *xen_vcpu_info; /* Linux <-> Xen vCPU id mapping */ -DEFINE_PER_CPU(int, xen_vcpu_id) = -1; +DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); /* These are unused until we support booting "pre-ballooned" */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8ffb089b19a5..b86ebb1a9a7f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -118,7 +118,7 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ -DEFINE_PER_CPU(int, xen_vcpu_id) = -1; +DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); enum xen_domain_type xen_domain_type = XEN_NATIVE; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 9a37c541822f..b5486e648607 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -9,8 +9,8 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); -DECLARE_PER_CPU(int, xen_vcpu_id); -static inline int xen_vcpu_nr(int cpu) +DECLARE_PER_CPU(uint32_t, xen_vcpu_id); +static inline uint32_t xen_vcpu_nr(int cpu) { return per_cpu(xen_vcpu_id, cpu); } -- cgit v1.2.3 From a5ff1b34e16c203397542d98c49c5c7783193946 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 25 Aug 2016 15:17:02 -0700 Subject: treewide: replace config_enabled() with IS_ENABLED() (2nd round) Commit 97f2645f358b ("tree-wide: replace config_enabled() with IS_ENABLED()") mostly killed config_enabled(), but some new users have appeared for v4.8-rc1. They are all used for a boolean option, so can be replaced with IS_ENABLED() safely. Link: http://lkml.kernel.org/r/1471970749-24867-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Kees Cook Acked-by: Peter Oberparleiter Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ralf Baechle Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/page.h | 4 ++-- arch/s390/kernel/setup.c | 6 ++---- arch/x86/mm/kaslr.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index ea0cd9773914..5f987598054f 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -164,7 +164,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; */ static inline unsigned long ___pa(unsigned long x) { - if (config_enabled(CONFIG_64BIT)) { + if (IS_ENABLED(CONFIG_64BIT)) { /* * For MIPS64 the virtual address may either be in one of * the compatibility segements ckseg0 or ckseg1, or it may @@ -173,7 +173,7 @@ static inline unsigned long ___pa(unsigned long x) return x < CKSEG0 ? XPHYSADDR(x) : CPHYSADDR(x); } - if (!config_enabled(CONFIG_EVA)) { + if (!IS_ENABLED(CONFIG_EVA)) { /* * We're using the standard MIPS32 legacy memory map, ie. * the address x is going to be in kseg0 or kseg1. We can diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ba5f456edaa9..7f7ba5f23f13 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -204,11 +204,9 @@ static void __init conmode_default(void) #endif } } else if (MACHINE_IS_KVM) { - if (sclp.has_vt220 && - config_enabled(CONFIG_SCLP_VT220_CONSOLE)) + if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) SET_CONSOLE_VT220; - else if (sclp.has_linemode && - config_enabled(CONFIG_SCLP_CONSOLE)) + else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) SET_CONSOLE_SCLP; else SET_CONSOLE_HVC; diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ec8654f117d8..bda8d5eef04d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -77,7 +77,7 @@ static inline unsigned long get_padding(struct kaslr_memory_region *region) */ static inline bool kaslr_memory_enabled(void) { - return kaslr_enabled() && !config_enabled(CONFIG_KASAN); + return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } /* Initialize base and padding for each memory region randomized with KASLR */ -- cgit v1.2.3 From 0d025d271e55f3de21f0aaaf54b42d20404d2b23 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 30 Aug 2016 08:04:16 -0500 Subject: mm/usercopy: get rid of CONFIG_DEBUG_STRICT_USER_COPY_CHECKS There are three usercopy warnings which are currently being silenced for gcc 4.6 and newer: 1) "copy_from_user() buffer size is too small" compile warning/error This is a static warning which happens when object size and copy size are both const, and copy size > object size. I didn't see any false positives for this one. So the function warning attribute seems to be working fine here. Note this scenario is always a bug and so I think it should be changed to *always* be an error, regardless of CONFIG_DEBUG_STRICT_USER_COPY_CHECKS. 2) "copy_from_user() buffer size is not provably correct" compile warning This is another static warning which happens when I enable __compiletime_object_size() for new compilers (and CONFIG_DEBUG_STRICT_USER_COPY_CHECKS). It happens when object size is const, but copy size is *not*. In this case there's no way to compare the two at build time, so it gives the warning. (Note the warning is a byproduct of the fact that gcc has no way of knowing whether the overflow function will be called, so the call isn't dead code and the warning attribute is activated.) So this warning seems to only indicate "this is an unusual pattern, maybe you should check it out" rather than "this is a bug". I get 102(!) of these warnings with allyesconfig and the __compiletime_object_size() gcc check removed. I don't know if there are any real bugs hiding in there, but from looking at a small sample, I didn't see any. According to Kees, it does sometimes find real bugs. But the false positive rate seems high. 3) "Buffer overflow detected" runtime warning This is a runtime warning where object size is const, and copy size > object size. All three warnings (both static and runtime) were completely disabled for gcc 4.6 with the following commit: 2fb0815c9ee6 ("gcc4: disable __compiletime_object_size for GCC 4.6+") That commit mistakenly assumed that the false positives were caused by a gcc bug in __compiletime_object_size(). But in fact, __compiletime_object_size() seems to be working fine. The false positives were instead triggered by #2 above. (Though I don't have an explanation for why the warnings supposedly only started showing up in gcc 4.6.) So remove warning #2 to get rid of all the false positives, and re-enable warnings #1 and #3 by reverting the above commit. Furthermore, since #1 is a real bug which is detected at compile time, upgrade it to always be an error. Having done all that, CONFIG_DEBUG_STRICT_USER_COPY_CHECKS is no longer needed. Signed-off-by: Josh Poimboeuf Cc: Kees Cook Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Brian Gerst Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Byungchul Park Cc: Nilay Vaish Signed-off-by: Linus Torvalds --- arch/parisc/Kconfig | 1 - arch/parisc/configs/c8000_defconfig | 1 - arch/parisc/configs/generic-64bit_defconfig | 1 - arch/parisc/include/asm/uaccess.h | 22 ++++----- arch/s390/Kconfig | 1 - arch/s390/configs/default_defconfig | 1 - arch/s390/configs/gcov_defconfig | 1 - arch/s390/configs/performance_defconfig | 1 - arch/s390/defconfig | 1 - arch/s390/include/asm/uaccess.h | 19 +++++--- arch/tile/Kconfig | 1 - arch/tile/include/asm/uaccess.h | 22 +++++---- arch/x86/Kconfig | 1 - arch/x86/include/asm/uaccess.h | 69 ++++------------------------- include/asm-generic/uaccess.h | 1 + include/linux/compiler-gcc.h | 2 +- lib/Kconfig.debug | 18 -------- lib/Makefile | 1 - lib/usercopy.c | 9 ---- 19 files changed, 45 insertions(+), 128 deletions(-) delete mode 100644 lib/usercopy.c (limited to 'arch/x86') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index cd8778103165..af12c2db9bb8 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -1,6 +1,5 @@ config PARISC def_bool y - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_MIGHT_HAVE_PC_PARPORT select HAVE_IDE select HAVE_OPROFILE diff --git a/arch/parisc/configs/c8000_defconfig b/arch/parisc/configs/c8000_defconfig index 1a8f6f95689e..f6a4c016304b 100644 --- a/arch/parisc/configs/c8000_defconfig +++ b/arch/parisc/configs/c8000_defconfig @@ -245,7 +245,6 @@ CONFIG_DEBUG_RT_MUTEXES=y CONFIG_PROVE_RCU_DELAY=y CONFIG_DEBUG_BLOCK_EXT_DEVT=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_KEYS=y # CONFIG_CRYPTO_HW is not set CONFIG_FONTS=y diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 7e0792658952..c564e6e1fa23 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -291,7 +291,6 @@ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y # CONFIG_SCHED_DEBUG is not set CONFIG_TIMER_STATS=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_PCBC=m diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 0f59fd9ca205..e9150487e20d 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -208,13 +208,13 @@ unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned lo #define __copy_to_user_inatomic __copy_to_user #define __copy_from_user_inatomic __copy_from_user -extern void copy_from_user_overflow(void) -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS - __compiletime_error("copy_from_user() buffer size is not provably correct") -#else - __compiletime_warning("copy_from_user() buffer size is not provably correct") -#endif -; +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, @@ -223,10 +223,12 @@ static inline unsigned long __must_check copy_from_user(void *to, int sz = __compiletime_object_size(to); int ret = -EFAULT; - if (likely(sz == -1 || !__builtin_constant_p(n) || sz >= n)) + if (likely(sz == -1 || sz >= n)) ret = __copy_from_user(to, from, n); - else - copy_from_user_overflow(); + else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); return ret; } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e751fe25d6ab..c109f073d454 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,7 +68,6 @@ config DEBUG_RODATA config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 26e0c7f08814..412b1bd21029 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -602,7 +602,6 @@ CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_IRQSOFF_TRACER=y CONFIG_PREEMPT_TRACER=y CONFIG_SCHED_TRACER=y diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 24879dab47bc..bec279eb4b93 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -552,7 +552,6 @@ CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_CPU_NOTIFIER_ERROR_INJECT=m CONFIG_PM_NOTIFIER_ERROR_INJECT=m CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_BLK_DEV_IO_TRACE=y # CONFIG_KPROBE_EVENT is not set CONFIG_TRACE_ENUM_MAP_FILE=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index a5c1e5f2a0ca..1751446a5bbb 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -549,7 +549,6 @@ CONFIG_TIMER_STATS=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 73610f2e3b4f..2d40ef0a6295 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -172,7 +172,6 @@ CONFIG_DEBUG_NOTIFIERS=y CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_RCU_TRACE=y CONFIG_LATENCYTOP=y -CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9b49cf1daa8f..95aefdba4be2 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -311,6 +311,14 @@ int __get_user_bad(void) __attribute__((noreturn)); #define __put_user_unaligned __put_user #define __get_user_unaligned __get_user +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} + /** * copy_to_user: - Copy a block of data into user space. * @to: Destination address, in user space. @@ -332,12 +340,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) return __copy_to_user(to, from, n); } -void copy_from_user_overflow(void) -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -__compiletime_warning("copy_from_user() buffer size is not provably correct") -#endif -; - /** * copy_from_user: - Copy a block of data from user space. * @to: Destination address, in kernel space. @@ -362,7 +364,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n) might_fault(); if (unlikely(sz != -1 && sz < n)) { - copy_from_user_overflow(); + if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); return n; } return __copy_from_user(to, from, n); diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 4820a02838ac..78da75b670bc 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -4,7 +4,6 @@ config TILE def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_WANT_FRAME_POINTERS diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index 0a9c4265763b..a77369e91e54 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -416,14 +416,13 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -/* - * There are still unprovable places in the generic code as of 2.6.34, so this - * option is not really compatible with -Werror, which is more useful in - * general. - */ -extern void copy_from_user_overflow(void) - __compiletime_warning("copy_from_user() size is not provably correct"); +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, @@ -433,14 +432,13 @@ static inline unsigned long __must_check copy_from_user(void *to, if (likely(sz == -1 || sz >= n)) n = _copy_from_user(to, from, n); + else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - copy_from_user_overflow(); + __bad_copy_user(); return n; } -#else -#define copy_from_user _copy_from_user -#endif #ifdef __tilegx__ /** diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c580d8c33562..2a1f0ce7c59a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,6 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a0ae610b9280..c3f291195294 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -697,43 +697,14 @@ unsigned long __must_check _copy_from_user(void *to, const void __user *from, unsigned long __must_check _copy_to_user(void __user *to, const void *from, unsigned n); -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -# define copy_user_diag __compiletime_error -#else -# define copy_user_diag __compiletime_warning -#endif - -extern void copy_user_diag("copy_from_user() buffer size is too small") -copy_from_user_overflow(void); -extern void copy_user_diag("copy_to_user() buffer size is too small") -copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); - -#undef copy_user_diag - -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS - -extern void -__compiletime_warning("copy_from_user() buffer size is not provably correct") -__copy_from_user_overflow(void) __asm__("copy_from_user_overflow"); -#define __copy_from_user_overflow(size, count) __copy_from_user_overflow() - -extern void -__compiletime_warning("copy_to_user() buffer size is not provably correct") -__copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); -#define __copy_to_user_overflow(size, count) __copy_to_user_overflow() - -#else +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); -static inline void -__copy_from_user_overflow(int size, unsigned long count) +static inline void copy_user_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -#define __copy_to_user_overflow __copy_from_user_overflow - -#endif - static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { @@ -743,31 +714,13 @@ copy_from_user(void *to, const void __user *from, unsigned long n) kasan_check_write(to, n); - /* - * While we would like to have the compiler do the checking for us - * even in the non-constant size case, any false positives there are - * a problem (especially when DEBUG_STRICT_USER_COPY_CHECKS, but even - * without - the [hopefully] dangerous looking nature of the warning - * would make people go look at the respecitive call sites over and - * over again just to find that there's no problem). - * - * And there are cases where it's just not realistic for the compiler - * to prove the count to be in range. For example when multiple call - * sites of a helper function - perhaps in different source files - - * all doing proper range checking, yet the helper function not doing - * so again. - * - * Therefore limit the compile time checking to the constant size - * case, and do only runtime checking for non-constant sizes. - */ - if (likely(sz < 0 || sz >= n)) { check_object_size(to, n, false); n = _copy_from_user(to, from, n); - } else if (__builtin_constant_p(n)) - copy_from_user_overflow(); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - __copy_from_user_overflow(sz, n); + __bad_copy_user(); return n; } @@ -781,21 +734,17 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); - /* See the comment in copy_from_user() above. */ if (likely(sz < 0 || sz >= n)) { check_object_size(from, n, true); n = _copy_to_user(to, from, n); - } else if (__builtin_constant_p(n)) - copy_to_user_overflow(); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - __copy_to_user_overflow(sz, n); + __bad_copy_user(); return n; } -#undef __copy_from_user_overflow -#undef __copy_to_user_overflow - /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 1bfa602958f2..5dea1fb6979c 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -72,6 +72,7 @@ struct exception_table_entry /* Returns 0 if exception not found and fixup otherwise. */ extern unsigned long search_exception_table(unsigned long); + /* * architectures with an MMU should override these two */ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 8dbc8929a6a0..573c5a18908f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -158,7 +158,7 @@ #define __compiler_offsetof(a, b) \ __builtin_offsetof(a, b) -#if GCC_VERSION >= 40100 && GCC_VERSION < 40600 +#if GCC_VERSION >= 40100 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0) #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2307d7c89dac..2e2cca509231 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1686,24 +1686,6 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. -config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - bool - -config DEBUG_STRICT_USER_COPY_CHECKS - bool "Strict user copy size checks" - depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS - depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING - help - Enabling this option turns a certain set of sanity checks for user - copy operations into compile time failures. - - The copy_from_user() etc checks are there to help test if there - are sufficient security checks on the length argument of - the copy operation, by having gcc prove that the argument is - within bounds. - - If unsure, say N. - source kernel/trace/Kconfig menu "Runtime Testing" diff --git a/lib/Makefile b/lib/Makefile index cfa68eb269e4..5dc77a8ec297 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o -obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o lib-$(CONFIG_HAS_DMA) += dma-noop.o diff --git a/lib/usercopy.c b/lib/usercopy.c deleted file mode 100644 index 4f5b1ddbcd25..000000000000 --- a/lib/usercopy.c +++ /dev/null @@ -1,9 +0,0 @@ -#include -#include -#include - -void copy_from_user_overflow(void) -{ - WARN(1, "Buffer overflow detected!\n"); -} -EXPORT_SYMBOL(copy_from_user_overflow); -- cgit v1.2.3 From 6af7e4f77259ee946103387372cb159f2e99a6d4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 1 Sep 2016 08:52:29 -0500 Subject: PCI: Mark Haswell Power Control Unit as having non-compliant BARs The Haswell Power Control Unit has a non-PCI register (CONFIG_TDP_NOMINAL) where BAR 0 is supposed to be. This is erratum HSE43 in the spec update referenced below: The PCIe* Base Specification indicates that Configuration Space Headers have a base address register at offset 0x10. Due to this erratum, the Power Control Unit's CONFIG_TDP_NOMINAL CSR (Bus 1; Device 30; Function 3; Offset 0x10) is located where a base register is expected. Mark the PCU as having non-compliant BARs so we don't try to probe any of them. There are no other BARs on this device. Rename the quirk so it's not Broadwell-specific. Link: http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html Link: http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-datasheet-vol-2.html (section 5.4, Device 30 Function 3) Link: https://bugzilla.kernel.org/show_bug.cgi?id=153881 Reported-by: Paul Menzel Tested-by: Prarit Bhargava Signed-off-by: Bjorn Helgaas Acked-by: Myron Stowe --- arch/x86/pci/fixup.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 837ea36a837d..6d52b94f4bb9 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -553,15 +553,21 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); /* - * Broadwell EP Home Agent BARs erroneously return non-zero values when read. + * Device [8086:2fc0] + * Erratum HSE43 + * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html * - * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html - * entry BDF2. + * Devices [8086:6f60,6fa0,6fc0] + * Erratum BDF2 + * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html */ -static void pci_bdwep_bar(struct pci_dev *dev) +static void pci_invalid_bar(struct pci_dev *dev) { dev->non_compliant_bars = 1; } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); -- cgit v1.2.3 From 236dec051078a8691950f56949612b4b74107e48 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 1 Sep 2016 16:14:47 -0700 Subject: kconfig: tinyconfig: provide whole choice blocks to avoid warnings Using "make tinyconfig" produces a couple of annoying warnings that show up for build test machines all the time: .config:966:warning: override: NOHIGHMEM changes choice state .config:965:warning: override: SLOB changes choice state .config:963:warning: override: KERNEL_XZ changes choice state .config:962:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:933:warning: override: SLOB changes choice state .config:930:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:870:warning: override: SLOB changes choice state .config:868:warning: override: KERNEL_XZ changes choice state .config:867:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state I've made a previous attempt at fixing them and we discussed a number of alternatives. I tried changing the Makefile to use "merge_config.sh -n $(fragment-list)" but couldn't get that to work properly. This is yet another approach, based on the observation that we do want to see a warning for conflicting 'choice' options, and that we can simply make them non-conflicting by listing all other options as disabled. This is a trivial patch that we can apply independent of plans for other changes. Link: http://lkml.kernel.org/r/20160829214952.1334674-2-arnd@arndb.de Link: https://storage.kernelci.org/mainline/v4.7-rc6/x86-tinyconfig/build.log https://patchwork.kernel.org/patch/9212749/ Signed-off-by: Arnd Bergmann Reviewed-by: Josh Triplett Reviewed-by: Masahiro Yamada Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/configs/tiny.config | 2 ++ kernel/configs/tiny.config | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4e2ecfa23c15..4b429df40d7a 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1 +1,3 @@ CONFIG_NOHIGHMEM=y +# CONFIG_HIGHMEM4G is not set +# CONFIG_HIGHMEM64G is not set diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y -- cgit v1.2.3 From 15301a570754c7af60335d094dd2d1808b0641a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 May 2016 13:47:26 -0400 Subject: x86/paravirt: Do not trace _paravirt_ident_*() functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Łukasz Daniluk reported that on a RHEL kernel that his machine would lock up after enabling function tracer. I asked him to bisect the functions within available_filter_functions, which he did and it came down to three: _paravirt_nop(), _paravirt_ident_32() and _paravirt_ident_64() It was found that this is only an issue when noreplace-paravirt is added to the kernel command line. This means that those functions are most likely called within critical sections of the funtion tracer, and must not be traced. In newer kenels _paravirt_nop() is defined within gcc asm(), and is no longer an issue. But both _paravirt_ident_{32,64}() causes the following splat when they are traced: mm/pgtable-generic.c:33: bad pmd ffff8800d2435150(0000000001d00054) mm/pgtable-generic.c:33: bad pmd ffff8800d3624190(0000000001d00070) mm/pgtable-generic.c:33: bad pmd ffff8800d36a5110(0000000001d00054) mm/pgtable-generic.c:33: bad pmd ffff880118eb1450(0000000001d00054) NMI watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [systemd-journal:469] Modules linked in: e1000e CPU: 2 PID: 469 Comm: systemd-journal Not tainted 4.6.0-rc4-test+ #513 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012 task: ffff880118f740c0 ti: ffff8800d4aec000 task.ti: ffff8800d4aec000 RIP: 0010:[] [] queued_spin_lock_slowpath+0x118/0x1a0 RSP: 0018:ffff8800d4aefb90 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88011eb16d40 RDX: ffffffff82485760 RSI: 000000001f288820 RDI: ffffea0000008030 RBP: ffff8800d4aefb90 R08: 00000000000c0000 R09: 0000000000000000 R10: ffffffff821c8e0e R11: 0000000000000000 R12: ffff880000200fb8 R13: 00007f7a4e3f7000 R14: ffffea000303f600 R15: ffff8800d4b562e0 FS: 00007f7a4e3d7840(0000) GS:ffff88011eb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7a4e3f7000 CR3: 00000000d3e71000 CR4: 00000000001406e0 Call Trace: _raw_spin_lock+0x27/0x30 handle_pte_fault+0x13db/0x16b0 handle_mm_fault+0x312/0x670 __do_page_fault+0x1b1/0x4e0 do_page_fault+0x22/0x30 page_fault+0x28/0x30 __vfs_read+0x28/0xe0 vfs_read+0x86/0x130 SyS_read+0x46/0xa0 entry_SYSCALL_64_fastpath+0x1e/0xa8 Code: 12 48 c1 ea 0c 83 e8 01 83 e2 30 48 98 48 81 c2 40 6d 01 00 48 03 14 c5 80 6a 5d 82 48 89 0a 8b 41 08 85 c0 75 09 f3 90 8b 41 08 <85> c0 74 f7 4c 8b 09 4d 85 c9 74 08 41 0f 18 09 eb 02 f3 90 8b Reported-by: Łukasz Daniluk Signed-off-by: Steven Rostedt Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/paravirt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ad5bc9578a73..1acfd76e3e26 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -56,12 +56,12 @@ asm (".pushsection .entry.text, \"ax\"\n" ".popsection"); /* identity function, which can be inlined */ -u32 _paravirt_ident_32(u32 x) +u32 notrace _paravirt_ident_32(u32 x) { return x; } -u64 _paravirt_ident_64(u64 x) +u64 notrace _paravirt_ident_64(u64 x) { return x; } -- cgit v1.2.3 From d1992996753132e2dafe955cccb2fb0714d3cfc4 Mon Sep 17 00:00:00 2001 From: Emanuel Czirai Date: Fri, 2 Sep 2016 07:35:50 +0200 Subject: x86/AMD: Apply erratum 665 on machines without a BIOS fix AMD F12h machines have an erratum which can cause DIV/IDIV to behave unpredictably. The workaround is to set MSRC001_1029[31] but sometimes there is no BIOS update containing that workaround so let's do it ourselves unconditionally. It is simple enough. [ Borislav: Wrote commit message. ] Signed-off-by: Emanuel Czirai Signed-off-by: Borislav Petkov Cc: Yaowu Xu Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160902053550.18097-1-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/amd.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f5c69d8974e1..b81fe2d63e15 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -669,6 +669,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } +#define MSR_AMD64_DE_CFG 0xC0011029 + +static void init_amd_ln(struct cpuinfo_x86 *c) +{ + /* + * Apply erratum 665 fix unconditionally so machines without a BIOS + * fix work. + */ + msr_set_bit(MSR_AMD64_DE_CFG, 31); +} + static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; @@ -726,6 +737,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 6: init_amd_k7(c); break; case 0xf: init_amd_k8(c); break; case 0x10: init_amd_gh(c); break; + case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; } -- cgit v1.2.3 From cc2187a6e037bc64404f63c6d650ff263c2200c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 4 Sep 2016 11:37:36 +0200 Subject: x86/microcode/AMD: Fix load of builtin microcode with randomized memory We do not need to add the randomization offset when the microcode is built in. Reported-and-tested-by: Emanuel Czirai Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20160904093736.GA11939@pd.tnic Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index b816971f5da4..620ab06bcf45 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -54,6 +54,7 @@ static LIST_HEAD(pcache); */ static u8 *container; static size_t container_size; +static bool ucode_builtin; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; @@ -281,18 +282,22 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { struct cpio_data cp; + bool *builtin; void **data; size_t *size; #ifdef CONFIG_X86_32 data = (void **)__pa_nodebug(&ucode_cpio.data); size = (size_t *)__pa_nodebug(&ucode_cpio.size); + builtin = (bool *)__pa_nodebug(&ucode_builtin); #else data = &ucode_cpio.data; size = &ucode_cpio.size; + builtin = &ucode_builtin; #endif - if (!load_builtin_amd_microcode(&cp, family)) + *builtin = load_builtin_amd_microcode(&cp, family); + if (!*builtin) cp = find_ucode_in_initrd(); if (!(cp.data && cp.size)) @@ -373,7 +378,8 @@ void load_ucode_amd_ap(void) return; /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + if (!ucode_builtin) + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; eax = cpuid_eax(0x00000001); eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); @@ -439,7 +445,8 @@ int __init save_microcode_in_initrd_amd(void) container = cont_va; /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + if (!ucode_builtin) + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -- cgit v1.2.3 From dadb57abc37499f565b23933dbf49b435c3ba8af Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 29 Aug 2016 14:38:51 -0600 Subject: efi/libstub: Allocate headspace in efi_get_memory_map() efi_get_memory_map() allocates a buffer to store the memory map that it retrieves. This buffer may need to be reused by the client after ExitBootServices() is called, at which point allocations are not longer permitted. To support this usecase, provide the allocated buffer size back to the client, and allocate some additional headroom to account for any reasonable growth in the map that is likely to happen between the call to efi_get_memory_map() and the client reusing the buffer. Signed-off-by: Jeffrey Hugo Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Leif Lindholm Cc: Ingo Molnar Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 20 ++++-- drivers/firmware/efi/libstub/efi-stub-helper.c | 96 ++++++++++++++++++-------- drivers/firmware/efi/libstub/fdt.c | 17 +++-- drivers/firmware/efi/libstub/random.c | 12 +++- include/linux/efi.h | 15 ++-- 5 files changed, 111 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff574dad95cc..c5b7c7b4f0d7 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1008,7 +1008,7 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle, bool is64) { struct efi_info *efi = &boot_params->efi_info; - unsigned long map_sz, key, desc_size; + unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; struct setup_data *e820ext; const char *signature; @@ -1019,14 +1019,20 @@ static efi_status_t exit_boot(struct boot_params *boot_params, bool called_exit = false; u8 nr_entries; int i; - - nr_desc = 0; - e820ext = NULL; - e820ext_size = 0; + struct efi_boot_memmap map; + + nr_desc = 0; + e820ext = NULL; + e820ext_size = 0; + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; get_map: - status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size, - &desc_version, &key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) return status; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 3bd127f95315..29368ac69221 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -41,6 +41,8 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; #define EFI_ALLOC_ALIGN EFI_PAGE_SIZE #endif +#define EFI_MMAP_NR_SLACK_SLOTS 8 + struct file_info { efi_file_handle_t *handle; u64 size; @@ -63,49 +65,62 @@ void efi_printk(efi_system_table_t *sys_table_arg, char *str) } } +static inline bool mmap_has_headroom(unsigned long buff_size, + unsigned long map_size, + unsigned long desc_size) +{ + unsigned long slack = buff_size - map_size; + + return slack / desc_size >= EFI_MMAP_NR_SLACK_SLOTS; +} + efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg, - efi_memory_desc_t **map, - unsigned long *map_size, - unsigned long *desc_size, - u32 *desc_ver, - unsigned long *key_ptr) + struct efi_boot_memmap *map) { efi_memory_desc_t *m = NULL; efi_status_t status; unsigned long key; u32 desc_version; - *map_size = sizeof(*m) * 32; + *map->desc_size = sizeof(*m); + *map->map_size = *map->desc_size * 32; + *map->buff_size = *map->map_size; again: - /* - * Add an additional efi_memory_desc_t because we're doing an - * allocation which may be in a new descriptor region. - */ - *map_size += sizeof(*m); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - *map_size, (void **)&m); + *map->map_size, (void **)&m); if (status != EFI_SUCCESS) goto fail; - *desc_size = 0; + *map->desc_size = 0; key = 0; - status = efi_call_early(get_memory_map, map_size, m, - &key, desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL) { + status = efi_call_early(get_memory_map, map->map_size, m, + &key, map->desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL || + !mmap_has_headroom(*map->buff_size, *map->map_size, + *map->desc_size)) { efi_call_early(free_pool, m); + /* + * Make sure there is some entries of headroom so that the + * buffer can be reused for a new map after allocations are + * no longer permitted. Its unlikely that the map will grow to + * exceed this headroom once we are ready to trigger + * ExitBootServices() + */ + *map->map_size += *map->desc_size * EFI_MMAP_NR_SLACK_SLOTS; + *map->buff_size = *map->map_size; goto again; } if (status != EFI_SUCCESS) efi_call_early(free_pool, m); - if (key_ptr && status == EFI_SUCCESS) - *key_ptr = key; - if (desc_ver && status == EFI_SUCCESS) - *desc_ver = desc_version; + if (map->key_ptr && status == EFI_SUCCESS) + *map->key_ptr = key; + if (map->desc_ver && status == EFI_SUCCESS) + *map->desc_ver = desc_version; fail: - *map = m; + *map->map = m; return status; } @@ -113,13 +128,20 @@ fail: unsigned long get_dram_base(efi_system_table_t *sys_table_arg) { efi_status_t status; - unsigned long map_size; + unsigned long map_size, buff_size; unsigned long membase = EFI_ERROR; struct efi_memory_map map; efi_memory_desc_t *md; + struct efi_boot_memmap boot_map; - status = efi_get_memory_map(sys_table_arg, (efi_memory_desc_t **)&map.map, - &map_size, &map.desc_size, NULL, NULL); + boot_map.map = (efi_memory_desc_t **)&map.map; + boot_map.map_size = &map_size; + boot_map.desc_size = &map.desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; + + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) return membase; @@ -144,15 +166,22 @@ efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, unsigned long *addr, unsigned long max) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; efi_memory_desc_t *map; efi_status_t status; unsigned long nr_pages; u64 max_addr = 0; int i; + struct efi_boot_memmap boot_map; + + boot_map.map = ↦ + boot_map.map_size = &map_size; + boot_map.desc_size = &desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; - status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size, - NULL, NULL); + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) goto fail; @@ -230,14 +259,21 @@ efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, unsigned long *addr) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; efi_memory_desc_t *map; efi_status_t status; unsigned long nr_pages; int i; + struct efi_boot_memmap boot_map; + + boot_map.map = ↦ + boot_map.map_size = &map_size; + boot_map.desc_size = &desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; - status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size, - NULL, NULL); + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) goto fail; diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index e58abfa953cc..bec0fa8d8746 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -175,13 +175,21 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, unsigned long fdt_addr, unsigned long fdt_size) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; u32 desc_ver; unsigned long mmap_key; efi_memory_desc_t *memory_map, *runtime_map; unsigned long new_fdt_size; efi_status_t status; int runtime_entry_count = 0; + struct efi_boot_memmap map; + + map.map = &runtime_map; + map.map_size = &map_size; + map.desc_size = &desc_size; + map.desc_ver = &desc_ver; + map.key_ptr = &mmap_key; + map.buff_size = &buff_size; /* * Get a copy of the current memory map that we will use to prepare @@ -189,8 +197,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, * subsequent allocations adding entries, since they could not affect * the number of EFI_MEMORY_RUNTIME regions. */ - status = efi_get_memory_map(sys_table, &runtime_map, &map_size, - &desc_size, &desc_ver, &mmap_key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) { pr_efi_err(sys_table, "Unable to retrieve UEFI memory map.\n"); return status; @@ -199,6 +206,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, pr_efi(sys_table, "Exiting boot services and installing virtual address map...\n"); + map.map = &memory_map; /* * Estimate size of new FDT, and allocate memory for it. We * will allocate a bigger buffer if this ends up being too @@ -218,8 +226,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, * we can get the memory map key needed for * exit_boot_services(). */ - status = efi_get_memory_map(sys_table, &memory_map, &map_size, - &desc_size, &desc_ver, &mmap_key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) goto fail_free_new_fdt; diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 53f6d3fe6d86..0c9f58c5ba50 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -73,12 +73,20 @@ efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, unsigned long random_seed) { unsigned long map_size, desc_size, total_slots = 0, target_slot; + unsigned long buff_size; efi_status_t status; efi_memory_desc_t *memory_map; int map_offset; + struct efi_boot_memmap map; - status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size, - &desc_size, NULL, NULL); + map.map = &memory_map; + map.map_size = &map_size; + map.desc_size = &desc_size; + map.desc_ver = NULL; + map.key_ptr = NULL; + map.buff_size = &buff_size; + + status = efi_get_memory_map(sys_table_arg, &map); if (status != EFI_SUCCESS) return status; diff --git a/include/linux/efi.h b/include/linux/efi.h index 23cd3ced8c1a..943fee524176 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -118,6 +118,15 @@ typedef struct { u32 imagesize; } efi_capsule_header_t; +struct efi_boot_memmap { + efi_memory_desc_t **map; + unsigned long *map_size; + unsigned long *desc_size; + u32 *desc_ver; + unsigned long *key_ptr; + unsigned long *buff_size; +}; + /* * EFI capsule flags */ @@ -1371,11 +1380,7 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, efi_loaded_image_t *image, int *cmd_line_len); efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg, - efi_memory_desc_t **map, - unsigned long *map_size, - unsigned long *desc_size, - u32 *desc_ver, - unsigned long *key_ptr); + struct efi_boot_memmap *map); efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, -- cgit v1.2.3 From d64934019f6cc39202e2f78063709f61ca5cb364 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 29 Aug 2016 14:38:54 -0600 Subject: x86/efi: Use efi_exit_boot_services() The eboot code directly calls ExitBootServices. This is inadvisable as the UEFI spec details a complex set of errors, race conditions, and API interactions that the caller of ExitBootServices must get correct. The eboot code attempts allocations after calling ExitBootSerives which is not permitted per the spec. Call the efi_exit_boot_services() helper intead, which handles the allocation scenario properly. Signed-off-by: Jeffrey Hugo Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Leif Lindholm Cc: Ingo Molnar Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 136 +++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c5b7c7b4f0d7..94dd4a31f5b3 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1004,85 +1004,87 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, return status; } +struct exit_boot_struct { + struct boot_params *boot_params; + struct efi_info *efi; + struct setup_data *e820ext; + __u32 e820ext_size; + bool is64; +}; + +static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, + struct efi_boot_memmap *map, + void *priv) +{ + static bool first = true; + const char *signature; + __u32 nr_desc; + efi_status_t status; + struct exit_boot_struct *p = priv; + + if (first) { + nr_desc = *map->buff_size / *map->desc_size; + if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) { + u32 nr_e820ext = nr_desc - + ARRAY_SIZE(p->boot_params->e820_map); + + status = alloc_e820ext(nr_e820ext, &p->e820ext, + &p->e820ext_size); + if (status != EFI_SUCCESS) + return status; + } + first = false; + } + + signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; + memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); + + p->efi->efi_systab = (unsigned long)sys_table_arg; + p->efi->efi_memdesc_size = *map->desc_size; + p->efi->efi_memdesc_version = *map->desc_ver; + p->efi->efi_memmap = (unsigned long)*map->map; + p->efi->efi_memmap_size = *map->map_size; + +#ifdef CONFIG_X86_64 + p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; + p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; +#endif + + return EFI_SUCCESS; +} + static efi_status_t exit_boot(struct boot_params *boot_params, void *handle, bool is64) { - struct efi_info *efi = &boot_params->efi_info; unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; struct setup_data *e820ext; - const char *signature; __u32 e820ext_size; - __u32 nr_desc, prev_nr_desc; efi_status_t status; __u32 desc_version; - bool called_exit = false; - u8 nr_entries; - int i; struct efi_boot_memmap map; + struct exit_boot_struct priv; + + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; + priv.boot_params = boot_params; + priv.efi = &boot_params->efi_info; + priv.e820ext = NULL; + priv.e820ext_size = 0; + priv.is64 = is64; - nr_desc = 0; - e820ext = NULL; - e820ext_size = 0; - map.map = &mem_map; - map.map_size = &map_sz; - map.desc_size = &desc_size; - map.desc_ver = &desc_version; - map.key_ptr = &key; - map.buff_size = &buff_size; - -get_map: - status = efi_get_memory_map(sys_table, &map); - + /* Might as well exit boot services now */ + status = efi_exit_boot_services(sys_table, handle, &map, &priv, + exit_boot_func); if (status != EFI_SUCCESS) return status; - prev_nr_desc = nr_desc; - nr_desc = map_sz / desc_size; - if (nr_desc > prev_nr_desc && - nr_desc > ARRAY_SIZE(boot_params->e820_map)) { - u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map); - - status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size); - if (status != EFI_SUCCESS) - goto free_mem_map; - - efi_call_early(free_pool, mem_map); - goto get_map; /* Allocated memory, get map again */ - } - - signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; - memcpy(&efi->efi_loader_signature, signature, sizeof(__u32)); - - efi->efi_systab = (unsigned long)sys_table; - efi->efi_memdesc_size = desc_size; - efi->efi_memdesc_version = desc_version; - efi->efi_memmap = (unsigned long)mem_map; - efi->efi_memmap_size = map_sz; - -#ifdef CONFIG_X86_64 - efi->efi_systab_hi = (unsigned long)sys_table >> 32; - efi->efi_memmap_hi = (unsigned long)mem_map >> 32; -#endif - - /* Might as well exit boot services now */ - status = efi_call_early(exit_boot_services, handle, key); - if (status != EFI_SUCCESS) { - /* - * ExitBootServices() will fail if any of the event - * handlers change the memory map. In which case, we - * must be prepared to retry, but only once so that - * we're guaranteed to exit on repeated failures instead - * of spinning forever. - */ - if (called_exit) - goto free_mem_map; - - called_exit = true; - efi_call_early(free_pool, mem_map); - goto get_map; - } - + e820ext = priv.e820ext; + e820ext_size = priv.e820ext_size; /* Historic? */ boot_params->alt_mem_k = 32 * 1024; @@ -1091,10 +1093,6 @@ get_map: return status; return EFI_SUCCESS; - -free_mem_map: - efi_call_early(free_pool, mem_map); - return status; } /* -- cgit v1.2.3 From e12c8f36f3f7a60d55938c5aed5999278fa92bcb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 30 Aug 2016 16:14:00 +0800 Subject: KVM: lapic: adjust preemption timer correctly when goes TSC backward MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TSC_OFFSET will be adjusted if discovers TSC backward during vCPU load. The preemption timer, which relies on the guest tsc to reprogram its preemption timer value, is also reprogrammed if vCPU is scheded in to a different pCPU. However, the current implementation reprogram preemption timer before TSC_OFFSET is adjusted to the right value, resulting in the preemption timer firing prematurely. This patch fix it by adjusting TSC_OFFSET before reprogramming preemption timer if TSC backward. Cc: Paolo Bonzini Cc: Radim Krċmář Cc: Yunhong Jiang Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19f9f9e05c2a..699f8726539a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2743,16 +2743,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } + if (kvm_lapic_hv_timer_in_use(vcpu) && + kvm_x86_ops->set_hv_timer(vcpu, + kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration -- cgit v1.2.3 From 79d102cbfd2e9d94257fcc7c82807ef1cdf80322 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 5 Sep 2016 17:30:07 +0200 Subject: perf/x86/intel/cqm: Check cqm/mbm enabled state in event init Yanqiu Zhang reported kernel panic when using mbm event on system where CQM is detected but without mbm event support, like with perf: # perf stat -e 'intel_cqm/event=3/' -a BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 IP: [] update_sample+0xbc/0xe0 ... [] __intel_mbm_event_init+0x18/0x20 [] flush_smp_call_function_queue+0x7b/0x160 [] generic_smp_call_function_single_interrupt+0x13/0x60 [] smp_call_function_interrupt+0x27/0x40 [] call_function_interrupt+0x8c/0xa0 ... The reason is that we currently allow to init mbm event even if mbm support is not detected. Adding checks for both cqm and mbm events and support into cqm's event_init. Fixes: 33c3cc7acfd9 ("perf/x86/mbm: Add Intel Memory B/W Monitoring enumeration and init") Reported-by: Yanqiu Zhang Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Vikas Shivappa Cc: Tony Luck Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1473089407-21857-1-git-send-email-jolsa@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/cqm.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 783c49ddef29..8f82b02934fa 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -458,6 +458,11 @@ static void __intel_cqm_event_count(void *info); static void init_mbm_sample(u32 rmid, u32 evt_type); static void __intel_mbm_event_count(void *info); +static bool is_cqm_event(int e) +{ + return (e == QOS_L3_OCCUP_EVENT_ID); +} + static bool is_mbm_event(int e) { return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); @@ -1366,6 +1371,10 @@ static int intel_cqm_event_init(struct perf_event *event) (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) return -EINVAL; + if ((is_cqm_event(event->attr.config) && !cqm_enabled) || + (is_mbm_event(event->attr.config) && !mbm_enabled)) + return -EINVAL; + /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || -- cgit v1.2.3 From e6971009a95a74f28c58bbae415c40effad1226c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Sep 2016 11:56:01 -0700 Subject: x86/uaccess: force copy_*_user() to be inlined As already done with __copy_*_user(), mark copy_*_user() as __always_inline. Without this, the checks for things like __builtin_const_p() won't work consistently in either hardened usercopy nor the recent adjustments for detecting usercopy overflows at compile time. The change in kernel text size is detectable, but very small: text data bss dec hex filename 12118735 5768608 14229504 32116847 1ea106f vmlinux.before 12120207 5768608 14229504 32118319 1ea162f vmlinux.after Signed-off-by: Kees Cook --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c3f291195294..e3af86f58eaf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -705,7 +705,7 @@ static inline void copy_user_overflow(int size, unsigned long count) WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -725,7 +725,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); -- cgit v1.2.3 From ce29856a5e1aabe52e18b2c60db1490769a6ab55 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Mon, 1 Aug 2016 23:01:56 +0200 Subject: um/ptrace: Fix the syscall number update after a ptrace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the syscall number after each PTRACE_SETREGS on ORIG_*AX. This is needed to get the potentially altered syscall number in the seccomp filters after RET_TRACE. This fix four seccomp_bpf tests: > [ RUN ] TRACE_syscall.skip_after_RET_TRACE > seccomp_bpf.c:1560:TRACE_syscall.skip_after_RET_TRACE:Expected -1 (18446744073709551615) == syscall(39) (26) > seccomp_bpf.c:1561:TRACE_syscall.skip_after_RET_TRACE:Expected 1 (1) == (*__errno_location ()) (22) > [ FAIL ] TRACE_syscall.skip_after_RET_TRACE > [ RUN ] TRACE_syscall.kill_after_RET_TRACE > TRACE_syscall.kill_after_RET_TRACE: Test exited normally instead of by signal (code: 1) > [ FAIL ] TRACE_syscall.kill_after_RET_TRACE > [ RUN ] TRACE_syscall.skip_after_ptrace > seccomp_bpf.c:1622:TRACE_syscall.skip_after_ptrace:Expected -1 (18446744073709551615) == syscall(39) (26) > seccomp_bpf.c:1623:TRACE_syscall.skip_after_ptrace:Expected 1 (1) == (*__errno_location ()) (22) > [ FAIL ] TRACE_syscall.skip_after_ptrace > [ RUN ] TRACE_syscall.kill_after_ptrace > TRACE_syscall.kill_after_ptrace: Test exited normally instead of by signal (code: 1) > [ FAIL ] TRACE_syscall.kill_after_ptrace Fixes: 26703c636c1f ("um/ptrace: run seccomp after ptrace") Signed-off-by: Mickaël Salaün Acked-by: Kees Cook Cc: Jeff Dike Cc: Richard Weinberger Cc: James Morris Cc: user-mode-linux-devel@lists.sourceforge.net Signed-off-by: James Morris Signed-off-by: Kees Cook --- arch/um/kernel/skas/syscall.c | 5 ----- arch/x86/um/ptrace_32.c | 3 +++ arch/x86/um/ptrace_64.c | 4 ++++ 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 0728fee94398..b783ac87d98a 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -27,12 +27,7 @@ void handle_syscall(struct uml_pt_regs *r) if (secure_computing(NULL) == -1) goto out; - /* Update the syscall number after orig_ax has potentially been updated - * with ptrace. - */ - UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); syscall = UPT_SYSCALL_NR(r); - if (syscall >= 0 && syscall <= __NR_syscall_max) PT_REGS_SET_SYSCALL_RETURN(regs, EXECUTE_SYSCALL(syscall, regs)); diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index ebd4dd6ef73b..a7ef7b131e25 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -84,7 +84,10 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case EAX: case EIP: case UESP: + break; case ORIG_EAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: if (value && (value & 3) != 3) diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index faab418876ce..0b5c184dd5b3 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -78,7 +78,11 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case RSI: case RDI: case RBP: + break; + case ORIG_RAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: -- cgit v1.2.3 From c291b015158577be533dd5a959dfc09bab119eed Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 7 Sep 2016 10:21:33 +0800 Subject: x86/apic: Fix num_processors value in case of failure If the topology package map check of the APIC ID and the CPU is a failure, we don't generate the processor info for that APIC ID yet we increase disabled_cpus by one - which is buggy. Only increase num_processors once we are sure we don't fail. Signed-off-by: Dou Liyang Acked-by: David Rientjes Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1473214893-16481-1-git-send-email-douly.fnst@cn.fujitsu.com [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 50c95af0f017..f3e9b2df4b16 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2093,7 +2093,6 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2116,10 +2115,13 @@ int generic_processor_info(int apicid, int version) pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } + num_processors++; + /* * Validate version */ -- cgit v1.2.3 From a4497a86fb9b855c5ac8503fdc959393b00bb643 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 8 Sep 2016 10:15:28 -0400 Subject: x86, clock: Fix kvm guest tsc initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When booting a kvm guest on AMD with the latest kernel the following messages are displayed in the boot log: tsc: Unable to calibrate against PIT tsc: HPET/PMTIMER calibration failed aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") introduced a change to account for a difference in cpu and tsc frequencies for Intel SKL processors. Before this change the native tsc set x86_platform.calibrate_tsc to native_calibrate_tsc() which is a hardware calibration of the tsc, and in tsc_init() executed tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; The kvm code changed x86_platform.calibrate_tsc to kvm_get_tsc_khz() and executed the same tsc_init() function. This meant that KVM guests did not execute the native hardware calibration function. After aa297292d708, there are separate native calibrations for cpu_khz and tsc_khz. The code sets x86_platform.calibrate_tsc to native_calibrate_tsc() which is now an Intel specific calibration function, and x86_platform.calibrate_cpu to native_calibrate_cpu() which is the "old" native_calibrate_tsc() function (ie, the native hardware calibration function). tsc_init() now does cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; The kvm code should not call the hardware initialization in native_calibrate_cpu(), as it isn't applicable for kvm and it didn't do that prior to aa297292d708. This patch resolves this issue by setting x86_platform.calibrate_cpu to kvm_get_tsc_khz(). v2: I had originally set x86_platform.calibrate_cpu to cpu_khz_from_cpuid(), however, pbonzini pointed out that the CPUID leaf in that function is not available in KVM. I have changed the function pointer to kvm_get_tsc_khz(). Fixes: aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") Signed-off-by: Prarit Bhargava Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Len Brown Cc: "Peter Zijlstra (Intel)" Cc: Borislav Petkov Cc: Adrian Hunter Cc: "Christopher S. Hall" Cc: David Woodhouse Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd26bb..3692249a70f1 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -289,6 +289,7 @@ void __init kvmclock_init(void) put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3 From 7d762e49c2117d3829eb3355f2617aea080ed3a7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 9 Sep 2016 18:08:23 +0200 Subject: perf/x86/amd/uncore: Prevent use after free The resent conversion of the cpu hotplug support in the uncore driver introduced a regression due to the way the callbacks are invoked at initialization time. The old code called the prepare/starting/online function on each online cpu as a block. The new code registers the hotplug callbacks in the core for each state. The core invokes the callbacks at each registration on all online cpus. The code implicitely relied on the prepare/starting/online callbacks being called as combo on a particular cpu, which was not obvious and completely undocumented. The resulting subtle wreckage happens due to the way how the uncore code manages shared data structures for cpus which share an uncore resource in hardware. The sharing is determined in the cpu starting callback, but the prepare callback allocates per cpu data for the upcoming cpu because potential sharing is unknown at this point. If the starting callback finds a online cpu which shares the hardware resource it takes a refcount on the percpu data of that cpu and puts the own data structure into a 'free_at_online' pointer of that shared data structure. The online callback frees that. With the old model this worked because in a starting callback only one non unused structure (the one of the starting cpu) was available. The new code allocates the data structures for all cpus when the prepare callback is registered. Now the starting function iterates through all online cpus and looks for a data structure (skipping its own) which has a matching hardware id. The id member of the data structure is initialized to 0, but the hardware id can be 0 as well. The resulting wreckage is: CPU0 finds a matching id on CPU1, takes a refcount on CPU1 data and puts its own data structure into CPU1s data structure to be freed. CPU1 skips CPU0 because the data structure is its allegedly unsued own. It finds a matching id on CPU2, takes a refcount on CPU1 data and puts its own data structure into CPU2s data structure to be freed. .... Now the online callbacks are invoked. CPU0 has a pointer to CPU1s data and frees the original CPU0 data. So far so good. CPU1 has a pointer to CPU2s data and frees the original CPU1 data, which is still referenced by CPU0 ---> Booom So there are two issues to be solved here: 1) The id field must be initialized at allocation time to a value which cannot be a valid hardware id, i.e. -1 This prevents the above scenario, but now CPU1 and CPU2 both stick their own data structure into the free_at_online pointer of CPU0. So we leak CPU1s data structure. 2) Fix the memory leak described in #1 Instead of having a single pointer, use a hlist to enqueue the superflous data structures which are then freed by the first cpu invoking the online callback. Ideally we should know the sharing _before_ invoking the prepare callback, but that's way beyond the scope of this bug fix. [ tglx: Rewrote changelog ] Fixes: 96b2bd3866a0 ("perf/x86/amd/uncore: Convert to hotplug state machine") Reported-and-tested-by: Eric Sandeen Signed-off-by: Sebastian Andrzej Siewior Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20160909160822.lowgmkdwms2dheyv@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/events/amd/uncore.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index e6131d4454e6..65577f081d07 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -29,6 +29,8 @@ #define COUNTER_SHIFT 16 +static HLIST_HEAD(uncore_unused_list); + struct amd_uncore { int id; int refcnt; @@ -39,7 +41,7 @@ struct amd_uncore { cpumask_t *active_mask; struct pmu *pmu; struct perf_event *events[MAX_COUNTERS]; - struct amd_uncore *free_when_cpu_online; + struct hlist_node node; }; static struct amd_uncore * __percpu *amd_uncore_nb; @@ -306,6 +308,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->id = -1; *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } @@ -319,6 +322,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_l2->active_mask = &amd_l2_active_mask; uncore_l2->pmu = &amd_l2_pmu; + uncore_l2->id = -1; *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; } @@ -348,7 +352,7 @@ amd_uncore_find_online_sibling(struct amd_uncore *this, continue; if (this->id == that->id) { - that->free_when_cpu_online = this; + hlist_add_head(&this->node, &uncore_unused_list); this = that; break; } @@ -388,13 +392,23 @@ static int amd_uncore_cpu_starting(unsigned int cpu) return 0; } +static void uncore_clean_online(void) +{ + struct amd_uncore *uncore; + struct hlist_node *n; + + hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { + hlist_del(&uncore->node); + kfree(uncore); + } +} + static void uncore_online(unsigned int cpu, struct amd_uncore * __percpu *uncores) { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - kfree(uncore->free_when_cpu_online); - uncore->free_when_cpu_online = NULL; + uncore_clean_online(); if (cpu == uncore->cpu) cpumask_set_cpu(cpu, uncore->active_mask); -- cgit v1.2.3 From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Sep 2016 08:51:21 -0700 Subject: mm: fix cache mode of dax pmd mappings track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as uncacheable rendering them impractical for application usage. DAX-pte mappings are cached and the goal of establishing DAX-pmd mappings is to attain more performance, not dramatically less (3 orders of magnitude). track_pfn_insert() relies on a previous call to reserve_memtype() to establish the expected page_cache_mode for the range. While memremap() arranges for reserve_memtype() to be called, devm_memremap_pages() does not. So, teach track_pfn_insert() and untrack_pfn() how to handle tracking without a vma, and arrange for devm_memremap_pages() to establish the write-back-cache reservation in the memtype tree. Cc: Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Nilesh Choudhury Cc: Kirill A. Shutemov Reported-by: Toshi Kani Reported-by: Kai Zhang Acked-by: Andrew Morton Signed-off-by: Dan Williams --- arch/x86/mm/pat.c | 17 ++++++++++------- kernel/memremap.c | 9 +++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ecb1b69c1651..170cc4ff057b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); -- cgit v1.2.3 From a9a94401c2b5805c71e39427b1af1bf1b9f67cd0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:51 +0300 Subject: perf/x86/intel/bts: Fix confused ordering of PMU callbacks The intel_bts driver is using a CPU-local 'started' variable to order callbacks and PMIs and make sure that AUX transactions don't get messed up. However, the ordering rules in regard to this variable is a complete mess, which recently resulted in perf_fuzzer-triggered warnings and panics. The general ordering rule that is patch is enforcing is that this cpu-local variable be set only when the cpu-local AUX transaction is active; consequently, this variable is to be checked before the AUX related bits can be touched. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-4-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 104 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 0a6e393a2e62..61e1d713b114 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -31,7 +31,17 @@ struct bts_ctx { struct perf_output_handle handle; struct debug_store ds_back; - int started; + int state; +}; + +/* BTS context states: */ +enum { + /* no ongoing AUX transactions */ + BTS_STATE_STOPPED = 0, + /* AUX transaction is on, BTS tracing is disabled */ + BTS_STATE_INACTIVE, + /* AUX transaction is on, BTS tracing is running */ + BTS_STATE_ACTIVE, }; static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); @@ -204,6 +214,15 @@ static void bts_update(struct bts_ctx *bts) static int bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); +/* + * Ordering PMU callbacks wrt themselves and the PMI is done by means + * of bts::state, which: + * - is set when bts::handle::event is valid, that is, between + * perf_aux_output_begin() and perf_aux_output_end(); + * - is zero otherwise; + * - is ordered against bts::handle::event with a compiler barrier. + */ + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); @@ -221,10 +240,13 @@ static void __bts_event_start(struct perf_event *event) /* * local barrier to make sure that ds configuration made it - * before we enable BTS + * before we enable BTS and bts::state goes ACTIVE */ wmb(); + /* INACTIVE/STOPPED -> ACTIVE */ + WRITE_ONCE(bts->state, BTS_STATE_ACTIVE); + intel_pmu_enable_bts(config); } @@ -251,9 +273,6 @@ static void bts_event_start(struct perf_event *event, int flags) __bts_event_start(event); - /* PMI handler: this counter is running and likely generating PMIs */ - ACCESS_ONCE(bts->started) = 1; - return; fail_end_stop: @@ -263,30 +282,34 @@ fail_stop: event->hw.state = PERF_HES_STOPPED; } -static void __bts_event_stop(struct perf_event *event) +static void __bts_event_stop(struct perf_event *event, int state) { + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */ + WRITE_ONCE(bts->state, state); + /* * No extra synchronization is mandated by the documentation to have * BTS data stores globally visible. */ intel_pmu_disable_bts(); - - if (event->hw.state & PERF_HES_STOPPED) - return; - - ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; } static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *buf = NULL; + int state = READ_ONCE(bts->state); + + if (state == BTS_STATE_ACTIVE) + __bts_event_stop(event, BTS_STATE_STOPPED); - /* PMI handler: don't restart this counter */ - ACCESS_ONCE(bts->started) = 0; + if (state != BTS_STATE_STOPPED) + buf = perf_get_aux(&bts->handle); - __bts_event_stop(event); + event->hw.state |= PERF_HES_STOPPED; if (flags & PERF_EF_UPDATE) { bts_update(bts); @@ -296,6 +319,7 @@ static void bts_event_stop(struct perf_event *event, int flags) bts->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); } @@ -310,8 +334,20 @@ static void bts_event_stop(struct perf_event *event, int flags) void intel_bts_enable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + int state = READ_ONCE(bts->state); + + /* + * Here we transition from INACTIVE to ACTIVE; + * if we instead are STOPPED from the interrupt handler, + * stay that way. Can't be ACTIVE here though. + */ + if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE)) + return; + + if (state == BTS_STATE_STOPPED) + return; - if (bts->handle.event && bts->started) + if (bts->handle.event) __bts_event_start(bts->handle.event); } @@ -319,8 +355,15 @@ void intel_bts_disable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + /* + * Here we transition from ACTIVE to INACTIVE; + * do nothing for STOPPED or INACTIVE. + */ + if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE) + return; + if (bts->handle.event) - __bts_event_stop(bts->handle.event); + __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE); } static int @@ -407,9 +450,13 @@ int intel_bts_interrupt(void) struct perf_event *event = bts->handle.event; struct bts_buffer *buf; s64 old_head; - int err; + int err = -ENOSPC; - if (!event || !bts->started) + /* + * this is wrapped in intel_bts_enable_local/intel_bts_disable_local, + * so we can only be INACTIVE or STOPPED + */ + if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) return 0; buf = perf_get_aux(&bts->handle); @@ -432,12 +479,21 @@ int intel_bts_interrupt(void) !!local_xchg(&buf->lost, 0)); buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return 1; + if (buf) + err = bts_buffer_reset(buf, &bts->handle); - err = bts_buffer_reset(buf, &bts->handle); - if (err) - perf_aux_output_end(&bts->handle, 0, false); + if (err) { + WRITE_ONCE(bts->state, BTS_STATE_STOPPED); + + if (buf) { + /* + * BTS_STATE_STOPPED should be visible before + * cleared handle::event + */ + barrier(); + perf_aux_output_end(&bts->handle, 0, false); + } + } return 1; } -- cgit v1.2.3 From 4d4c474124649198d9b0a065c06f9362cf18e14e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:52 +0300 Subject: perf/x86/intel/bts: Fix BTS PMI detection Since BTS doesn't have a dedicated PMI status bit, the driver needs to take extra care to check for the condition that triggers it to avoid spurious NMI warnings. Regardless of the local BTS context state, the only way of knowing that the NMI is ours is to compare the write pointer against the interrupt threshold. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-5-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 61e1d713b114..9233edf993e1 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -446,26 +446,37 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) int intel_bts_interrupt(void) { + struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct perf_event *event = bts->handle.event; struct bts_buffer *buf; s64 old_head; - int err = -ENOSPC; + int err = -ENOSPC, handled = 0; + + /* + * The only surefire way of knowing if this NMI is ours is by checking + * the write ptr against the PMI threshold. + */ + if (ds->bts_index >= ds->bts_interrupt_threshold) + handled = 1; /* * this is wrapped in intel_bts_enable_local/intel_bts_disable_local, * so we can only be INACTIVE or STOPPED */ if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) - return 0; + return handled; buf = perf_get_aux(&bts->handle); + if (!buf) + return handled; + /* * Skip snapshot counters: they don't use the interrupt, but * there's no other way of telling, because the pointer will * keep moving */ - if (!buf || buf->snapshot) + if (buf->snapshot) return 0; old_head = local_read(&buf->head); @@ -473,7 +484,7 @@ int intel_bts_interrupt(void) /* no new data */ if (old_head == local_read(&buf->head)) - return 0; + return handled; perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); -- cgit v1.2.3 From ef9ef3befa0d76008e988a9ed9fe439e803351b9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:53 +0300 Subject: perf/x86/intel/bts: Kill a silly warning At the moment, intel_bts will WARN() out if there is more than one event writing to the same ring buffer, via SET_OUTPUT, and will only send data from one event to a buffer. There is no reason to have this warning in, so kill it. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-6-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 9233edf993e1..bdcd6510992c 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -378,8 +378,6 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) return 0; head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); - if (WARN_ON_ONCE(head != local_read(&buf->head))) - return -EINVAL; phys = &buf->buf[buf->cur_buf]; space = phys->offset + phys->displacement + phys->size - head; -- cgit v1.2.3 From 8ef9b8455a2a3049efa9e46e8a6402b972a3eb41 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Sep 2016 14:42:55 +0200 Subject: perf/x86/intel: Fix PEBSv3 record drain Alexander hit the WARN_ON_ONCE(!event) on his Skylake while running the perf fuzzer. This means the PEBSv3 record included a status bit for an inactive event, something that _should_ not happen. Move the code that filters the status bits against our known PEBS events up a spot to guarantee we only deal with events we know about. Further add "continue" statements to the WARN_ON_ONCE()s such that we'll not die nor generate silly events in case we ever do hit them again. Reported-by: Alexander Shishkin Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org Fixes: a3d86542de88 ("perf/x86/intel/pebs: Add PEBSv3 decoding") Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ce9f3f669e6..9b983a474253 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1274,18 +1274,18 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) struct pebs_record_nhm *p = at; u64 pebs_status; - /* PEBS v3 has accurate status bits */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&p->status, - MAX_PEBS_EVENTS) + for_each_set_bit(bit, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) counts[bit]++; continue; } - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - /* * On some CPUs the PEBS status can be zero when PEBS is * racing with clearing of GLOBAL_STATUS. @@ -1333,8 +1333,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; event = cpuc->events[bit]; - WARN_ON_ONCE(!event); - WARN_ON_ONCE(!event->attr.precise_ip); + if (WARN_ON_ONCE(!event)) + continue; + + if (WARN_ON_ONCE(!event->attr.precise_ip)) + continue; /* log dropped samples number */ if (error[bit]) -- cgit v1.2.3 From cecf62352aee2b4fe114aafd1b8c5f265a4243ce Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 11:22:33 +0300 Subject: perf/x86/intel: Don't disable "intel_bts" around "intel" event batching At the moment, intel_bts events get disabled from intel PMU's disable callback, which includes event scheduling transactions of said PMU, which have nothing to do with intel_bts events. We do want to keep intel_bts events off inside the PMI handler to avoid filling up their buffer too soon. This patch moves intel_bts enabling/disabling directly to the PMI handler. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915082233.11065-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2cbde2f449aa..4c9a79b9cd69 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1730,9 +1730,11 @@ static __initconst const u64 knl_hw_cache_extra_regs * disabled state if called consecutively. * * During consecutive calls, the same disable value will be written to related - * registers, so the PMU state remains unchanged. hw.state in - * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive - * calls. + * registers, so the PMU state remains unchanged. + * + * intel_bts events don't coexist with intel PMU's BTS events because of + * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them + * disabled around intel PMU's event batching etc, only inside the PMI handler. */ static void __intel_pmu_disable_all(void) { @@ -1742,8 +1744,6 @@ static void __intel_pmu_disable_all(void) if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); - else - intel_bts_disable_local(); intel_pmu_pebs_disable_all(); } @@ -1771,8 +1771,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) return; intel_pmu_enable_bts(event->hw.config); - } else - intel_bts_enable_local(); + } } static void intel_pmu_enable_all(int added) @@ -2073,6 +2072,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_bts_disable_local(); __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); @@ -2172,6 +2172,7 @@ done: /* Only restore PMU state when it's active. See x86_pmu_disable(). */ if (cpuc->enabled) __intel_pmu_enable_all(0, true); + intel_bts_enable_local(); /* * Only unmask the NMI after the overflow counters -- cgit v1.2.3 From b0eaf4506f5f95d15d6731d72c0ddf4a2179eefa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Sep 2016 23:39:12 +0200 Subject: kvm: x86: correctly reset dest_map->vector when restoring LAPIC state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When userspace sends KVM_SET_LAPIC, KVM schedules a check between the vCPU's IRR and ISR and the IOAPIC redirection table, in order to re-establish the IOAPIC's dest_map (the list of CPUs servicing the real-time clock interrupt with the corresponding vectors). However, __rtc_irq_eoi_tracking_restore_one was forgetting to set dest_map->vectors. Because of this, the IOAPIC did not process the real-time clock interrupt EOI, ioapic->rtc_status.pending_eoi got stuck at a non-zero value, and further RTC interrupts were reported to userspace as coalesced. Fixes: 9e4aabe2bb3454c83dac8139cf9974503ee044db Fixes: 4d99ba898dd0c521ca6cdfdde55c9b58aea3cb3d Cc: stable@vger.kernel.org Cc: Joerg Roedel Cc: David Gilbert Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 5f42d038fcb4..c7220ba94aa7 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -109,6 +109,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) { bool new_val, old_val; struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; @@ -117,16 +118,17 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + old_val = test_bit(vcpu->vcpu_id, dest_map->map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __set_bit(vcpu->vcpu_id, dest_map->map); + dest_map->vectors[vcpu->vcpu_id] = e->fields.vector; ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __clear_bit(vcpu->vcpu_id, dest_map->map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } -- cgit v1.2.3 From 1c109fabbd51863475cd12ac206bdd249aee35af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Sep 2016 02:35:29 +0100 Subject: fix minor infoleak in get_user_ex() get_user_ex(x, ptr) should zero x on failure. It's not a lot of a leak (at most we are leaking uninitialized 64bit value off the kernel stack, and in a fairly constrained situation, at that), but the fix is trivial, so... Cc: stable@vger.kernel.org Signed-off-by: Al Viro [ This sat in different branch from the uaccess fixes since mid-August ] Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index e3af86f58eaf..2131c4ce7d8a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -433,7 +433,11 @@ do { \ #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ asm volatile("1: mov"itype" %1,%"rtype"0\n" \ "2:\n" \ - _ASM_EXTABLE_EX(1b, 2b) \ + ".section .fixup,\"ax\"\n" \ + "3:xor"itype" %"rtype"0,%"rtype"0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_EX(1b, 3b) \ : ltype(x) : "m" (__m(addr))) #define __put_user_nocheck(x, ptr, size) \ -- cgit v1.2.3 From 95f60084acbcee6c466256cf26eb52191fad9edc Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 18:13:50 +0300 Subject: perf/x86/intel/pt: Fix an off-by-one in address filter configuration PT address filter configuration requires that a range is specified by its first and last address, but at the moment we're obtaining the end of the range by adding user specified size to its start, which is off by one from what it actually needs to be. Fix this and make sure that zero-sized filters don't pass the filter validation. Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org # v4.7 Cc: stable@vger.kernel.org#v4.7 Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915151352.21306-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 04bb5fb5a8d7..5ec0100e3fc6 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1081,7 +1081,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters) list_for_each_entry(filter, filters, entry) { /* PT doesn't support single address triggers */ - if (!filter->range) + if (!filter->range || !filter->size) return -EOPNOTSUPP; if (!filter->inode && !kernel_ip(filter->offset)) @@ -1111,7 +1111,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event) } else { /* apply the offset */ msr_a = filter->offset + offs[range]; - msr_b = filter->size + msr_a; + msr_b = filter->size + msr_a - 1; } filters->filter[range].msr_a = msr_a; -- cgit v1.2.3 From ddfdad991e55b65c1cc4ee29502f6dceee04455a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 18:13:51 +0300 Subject: perf/x86/intel/pt: Fix kernel address filter's offset validation The kernel_ip() filter is used mostly by the DS/LBR code to look at the branch addresses, but Intel PT also uses it to validate the address filter offsets for kernel addresses, for which it is not sufficient: supplying something in bits 64:48 that's not a sign extension of the lower address bits (like 0xf00d000000000000) throws a #GP. This patch adds address validation for the user supplied kernel filters. Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org # v4.7 Cc: stable@vger.kernel.org#v4.7 Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915151352.21306-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 5ec0100e3fc6..1f94963a283b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1074,6 +1074,11 @@ static void pt_addr_filters_fini(struct perf_event *event) event->hw.addr_filters = NULL; } +static inline bool valid_kernel_ip(unsigned long ip) +{ + return virt_addr_valid(ip) && kernel_ip(ip); +} + static int pt_event_addr_filters_validate(struct list_head *filters) { struct perf_addr_filter *filter; @@ -1084,7 +1089,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters) if (!filter->range || !filter->size) return -EOPNOTSUPP; - if (!filter->inode && !kernel_ip(filter->offset)) + if (!filter->inode && !valid_kernel_ip(filter->offset)) return -EINVAL; if (++range > pt_cap_get(PT_CAP_num_address_ranges)) -- cgit v1.2.3 From 1155bafcb79208abc6ae234c6e135ac70607755c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 18:13:52 +0300 Subject: perf/x86/intel/pt: Do validate the size of a kernel address filter Right now, the kernel address filters in PT are prone to integer overflow that may happen in adding filter's size to its offset to obtain the end of the range. Such an overflow would also throw a #GP in the PT event configuration path. Fix this by explicitly validating the result of this calculation. Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org # v4.7 Cc: stable@vger.kernel.org#v4.7 Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915151352.21306-4-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1f94963a283b..861a7d9cb60f 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1089,8 +1089,13 @@ static int pt_event_addr_filters_validate(struct list_head *filters) if (!filter->range || !filter->size) return -EOPNOTSUPP; - if (!filter->inode && !valid_kernel_ip(filter->offset)) - return -EINVAL; + if (!filter->inode) { + if (!valid_kernel_ip(filter->offset)) + return -EINVAL; + + if (!valid_kernel_ip(filter->offset + filter->size)) + return -EINVAL; + } if (++range > pt_cap_get(PT_CAP_num_address_ranges)) return -EOPNOTSUPP; -- cgit v1.2.3 From 080fe0b790ad438fc1b61621dac37c1964ce7f35 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 24 Aug 2016 14:12:08 +0100 Subject: perf/x86/amd: Make HW_CACHE_REFERENCES and HW_CACHE_MISSES measure L2 While the Intel PMU monitors the LLC when perf enables the HW_CACHE_REFERENCES and HW_CACHE_MISSES events, these events monitor L1 instruction cache fetches (0x0080) and instruction cache misses (0x0081) on the AMD PMU. This is extremely confusing when monitoring the same workload across Intel and AMD machines, since parameters like, $ perf stat -e cache-references,cache-misses measure completely different things. Instead, make the AMD PMU measure instruction/data cache and TLB fill requests to the L2 and instruction/data cache and TLB misses in the L2 when HW_CACHE_REFERENCES and HW_CACHE_MISSES are enabled, respectively. That way the events measure unified caches on both platforms. Signed-off-by: Matt Fleming Acked-by: Peter Zijlstra Cc: Cc: Borislav Petkov Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1472044328-21302-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/events/amd/core.c | 4 ++-- arch/x86/kvm/pmu_amd.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e07a22bb9308..f5f4b3fbbbc2 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -119,8 +119,8 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 39b91127ef07..cd944435dfbd 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -23,8 +23,8 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES }, + [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, -- cgit v1.2.3