From 2406e3b166eee42777a6b0b38f52f924454474d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Sep 2017 21:36:56 +0200 Subject: perf/x86/intel, watchdog/core: Sanitize PMU HT bug workaround The lockup_detector_suspend/resume() interface is broken in several ways especially as it results in recursive locking of the CPU hotplug lock. Use the new stop/restart interface in the perf NMI watchdog to temporarily disable and reenable the already active watchdog events. That's enough to handle it. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Don Zickus Cc: Andrew Morton Cc: Borislav Petkov Cc: Chris Metcalf Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Sebastian Siewior Cc: Ulrich Obergfell Link: http://lkml.kernel.org/r/20170912194146.247141871@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 829e89cfcee2..9fb9a1f1e47b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void) return 0; } - if (lockup_detector_suspend() != 0) { - pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); - return 0; - } + cpus_read_lock(); + + hardlockup_detector_perf_stop(); x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - lockup_detector_resume(); - - cpus_read_lock(); + hardlockup_detector_perf_restart(); for_each_online_cpu(c) free_excl_cntrs(c); -- cgit v1.2.3 From ee213fc72fd67d0988525af501534f4cb924d1e9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Oct 2017 08:51:43 -0500 Subject: kprobes/x86: Set up frame pointer in kprobe trampoline Richard Weinberger saw an unwinder warning when running bcc's opensnoop: WARNING: kernel stack frame pointer at ffff99ef4076bea0 in opensnoop:2008 has bad value 0000000000000008 unwind stack type:0 next_sp: (null) mask:0x2 graph_idx:0 ... ffff99ef4076be88: ffff99ef4076bea0 (0xffff99ef4076bea0) ffff99ef4076be90: ffffffffac442721 (optimized_callback +0x81/0x90) ... A lockdep stack trace was initiated from inside a kprobe handler, when the unwinder noticed a bad frame pointer on the stack. The bad frame pointer is related to the fact that the kprobe optprobe trampoline doesn't save the frame pointer before calling into optimized_callback(). Reported-and-tested-by: Richard Weinberger Signed-off-by: Josh Poimboeuf Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7aef2f8ecd75c2f505ef9b80490412262cf4a44c.1507038547.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/common.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index db2182d63ed0..3fc0f9a794cb 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -3,6 +3,15 @@ /* Kprobes and Optprobes common header */ +#include + +#ifdef CONFIG_FRAME_POINTER +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ + " mov %" _ASM_SP ", %" _ASM_BP "\n" +#else +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" +#endif + #ifdef CONFIG_X86_64 #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax. */ \ @@ -17,7 +26,7 @@ " pushq %r10\n" \ " pushq %r11\n" \ " pushq %rbx\n" \ - " pushq %rbp\n" \ + SAVE_RBP_STRING \ " pushq %r12\n" \ " pushq %r13\n" \ " pushq %r14\n" \ @@ -48,7 +57,7 @@ " pushl %es\n" \ " pushl %ds\n" \ " pushl %eax\n" \ - " pushl %ebp\n" \ + SAVE_RBP_STRING \ " pushl %edi\n" \ " pushl %esi\n" \ " pushl %edx\n" \ -- cgit v1.2.3 From b664d57f39d01e775204d4f1a7e2f8bda77bc549 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 3 Oct 2017 16:18:02 +0900 Subject: kprobes/x86: Remove IRQ disabling from jprobe handlers Jprobes actually don't need to disable IRQs while calling handlers, because of how we specify the kernel interface in Documentation/kprobes.txt: ----- Probe handlers are run with preemption disabled. Depending on the architecture and optimization state, handlers may also run with interrupts disabled (e.g., kretprobe handlers and optimized kprobe handlers run without interrupt disabled on x86/x86-64). ----- So let's remove IRQ disabling from jprobes too. Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150701508194.32266.14458959863314097305.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f0153714ddac..0742491cbb73 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1080,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) * raw stack chunk with redzones: */ __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->flags &= ~X86_EFLAGS_IF; - trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); /* -- cgit v1.2.3 From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Oct 2017 15:37:20 -0700 Subject: bpf: fix bpf_tail_call() x64 JIT - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 4 ++-- include/uapi/linux/bpf.h | 2 +- kernel/bpf/core.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c9573660d51..0554e8aef4d5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ #define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..f90860d1f897 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,7 +312,7 @@ union bpf_attr { * jump into another BPF program * @ctx: context pointer passed to next program * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run + * @index: 32-bit index inside array that selects specific program to run * Return: 0 on success or negative error * * int bpf_clone_redirect(skb, ifindex, flags) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..7b62df86be1d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1022,7 +1022,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; -- cgit v1.2.3 From a2b7861bb33b2538420bb5d8554153484d3f961f Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 3 Oct 2017 21:36:51 +0800 Subject: kvm/x86: Avoid async PF preempting the kernel incorrectly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, in PREEMPT_COUNT=n kernel, kvm_async_pf_task_wait() could call schedule() to reschedule in some cases. This could result in accidentally ending the current RCU read-side critical section early, causing random memory corruption in the guest, or otherwise preempting the currently running task inside between preempt_disable and preempt_enable. The difficulty to handle this well is because we don't know whether an async PF delivered in a preemptible section or RCU read-side critical section for PREEMPT_COUNT=n, since preempt_disable()/enable() and rcu_read_lock/unlock() are both no-ops in that case. To cure this, we treat any async PF interrupting a kernel context as one that cannot be preempted, preventing kvm_async_pf_task_wait() from choosing the schedule() path in that case. To do so, a second parameter for kvm_async_pf_task_wait() is introduced, so that we know whether it's called from a context interrupting the kernel, and the parameter is set properly in all the callsites. Cc: "Paul E. McKenney" Cc: Peter Zijlstra Cc: Wanpeng Li Cc: stable@vger.kernel.org Signed-off-by: Boqun Feng Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_para.h | 4 ++-- arch/x86/kernel/kvm.c | 14 ++++++++++---- arch/x86/kvm/mmu.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index bc62e7cbf1b1..59ad3d132353 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); void __init kvm_guest_init(void); -void kvm_async_pf_task_wait(u32 token); +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); @@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void) #else /* CONFIG_KVM_GUEST */ #define kvm_guest_init() do {} while (0) -#define kvm_async_pf_task_wait(T) do {} while(0) +#define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e675704fa6f7..8bb9594d0761 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -void kvm_async_pf_task_wait(u32 token) +/* + * @interrupt_kernel: Is this called from a routine which interrupts the kernel + * (other than user space)? + */ +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1 || - rcu_preempt_depth(); + n.halted = is_idle_task(current) || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) + ? preempt_count() > 1 || rcu_preempt_depth() + : interrupt_kernel); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2()); + kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eca30c1eb1d9..106d4a029a8a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, case KVM_PV_REASON_PAGE_NOT_PRESENT: vcpu->arch.apf.host_apf_reason = 0; local_irq_disable(); - kvm_async_pf_task_wait(fault_address); + kvm_async_pf_task_wait(fault_address, 0); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: -- cgit v1.2.3 From 262e681183ddcdb24d64a2f993e41a226adcec29 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 2 Oct 2017 11:28:36 +0200 Subject: x86/mce: Hide mca_cfg Now that lguest is gone, put it in the internal header which should be used only by MCA/RAS code. Add missing header guards while at it. No functional change. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171002092836.22971-3-bp@alien8.de --- arch/x86/include/asm/mce.h | 1 - arch/x86/kernel/cpu/mcheck/mce-internal.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 181264989db5..8edac1de2e35 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -187,7 +187,6 @@ struct mca_msr_regs { extern struct mce_vendor_flags mce_flags; -extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; enum mce_notifier_prios { diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 098530a93bb7..debb974fd17d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,6 @@ +#ifndef __X86_MCE_INTERNAL_H__ +#define __X86_MCE_INTERNAL_H__ + #include #include @@ -108,3 +111,7 @@ static inline void mce_work_trigger(void) { } static inline void mce_register_injector_chain(struct notifier_block *nb) { } static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif + +extern struct mca_config mca_cfg; + +#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 40e28ed77fbf..486f640b02ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -28,6 +28,8 @@ #include #include +#include "mce-internal.h" + #define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 -- cgit v1.2.3 From f26e60167d8b5b1c67b3efd4cb5672da446bdb0e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 4 Oct 2017 10:39:05 -0500 Subject: x86/kvm: Move kvm_fastop_exception to .fixup section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling the kernel with the '-frecord-gcc-switches' flag, objtool complains: arch/x86/kvm/emulate.o: warning: objtool: .GCC.command.line+0x0: special: can't find new instruction And also the kernel fails to link. The problem is that the 'kvm_fastop_exception' code gets placed into the throwaway '.GCC.command.line' section instead of '.text'. Exception fixup code is conventionally placed in the '.fixup' section, so put it there where it belongs. Reported-and-tested-by: Guenter Roeck Signed-off-by: Josh Poimboeuf Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/emulate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a36254cbf776..d90cdc77e077 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #op " %al \n\t" \ FOP_RET -asm(".global kvm_fastop_exception \n" - "kvm_fastop_exception: xor %esi, %esi; ret"); +asm(".pushsection .fixup, \"ax\"\n" + ".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret\n" + ".popsection"); FOP_START(setcc) FOP_SETCC(seto) -- cgit v1.2.3 From e42eef4ba38806b18c4a74f0c276fb2e0b548173 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Oct 2017 12:28:18 +0200 Subject: KVM: add X86_LOCAL_APIC dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rework of the posted interrupt handling broke building without support for the local APIC: ERROR: "boot_cpu_physical_apicid" [arch/x86/kvm/kvm-intel.ko] undefined! That configuration is probably not particularly useful anyway, so we can avoid the randconfig failures by adding a Kconfig dependency. Fixes: 8b306e2f3c41 ("KVM: VMX: avoid double list add with VT-d posted interrupts") Signed-off-by: Arnd Bergmann Signed-off-by: Radim Krčmář --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3ea624452f93..3c48bc8bf08c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: depends on NET && MULTIUSER + depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES -- cgit v1.2.3 From 924c6b900cfdf376b07bccfd80e62b21914f8a5a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 8 Oct 2017 21:53:05 -0700 Subject: x86/mm/64: Fix reboot interaction with CR4.PCIDE Trying to reboot via real mode fails with PCID on: long mode cannot be exited while CR4.PCIDE is set. (No, I have no idea why, but the SDM and actual CPUs are in agreement here.) The result is a GPF and a hang instead of a reboot. I didn't catch this in testing because neither my computer nor my VM reboots this way. I can trigger it with reboot=bios, though. Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems") Reported-and-tested-by: Steven Rostedt (VMware) Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Link: https://lkml.kernel.org/r/f1e7d965998018450a7a70c2823873686a8b21c0.1507524746.git.luto@kernel.org --- arch/x86/kernel/reboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 54180fa6f66f..add33f600531 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -105,6 +105,10 @@ void __noreturn machine_real_restart(unsigned int type) load_cr3(initial_page_table); #else write_cr3(real_mode_header->trampoline_pgd); + + /* Exiting long mode will fail if CR4.PCIDE is set. */ + if (static_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); #endif /* Jump to the identity-mapped low memory code */ -- cgit v1.2.3 From 6b32c126d33d5cb379bca280ab8acedc1ca978ff Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 Oct 2017 20:30:12 +0200 Subject: x86/alternatives: Fix alt_max_short macro to really be a max() The alt_max_short() macro in asm/alternative.h does not work as intended, leading to nasty bugs. E.g. alt_max_short("1", "3") evaluates to 3, but alt_max_short("3", "1") evaluates to 1 -- not exactly the maximum of 1 and 3. In fact, I had to learn it the hard way by crashing my kernel in not so funny ways by attempting to make use of the ALTENATIVE_2 macro with alternatives where the first one was larger than the second one. According to [1] and commit dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") the right handed side should read "-(-(a < b))" not "-(-(a - b))". Fix that, to make the macro work as intended. While at it, fix up the comments regarding the additional "-", too. It's not about gas' usage of s32 but brain dead logic of having a "true" value of -1 for the < operator ... *sigh* Btw., the one in asm/alternative-asm.h is correct. And, apparently, all current users of ALTERNATIVE_2() pass same sized alternatives, avoiding to hit the bug. [1] http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax Reviewed-and-tested-by: Borislav Petkov Fixes: dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") Signed-off-by: Mathias Krause Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1507228213-13095-1-git-send-email-minipli@googlemail.com --- arch/x86/include/asm/alternative-asm.h | 4 +++- arch/x86/include/asm/alternative.h | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e7636bac7372..6c98821fef5e 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -62,8 +62,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index c096624137ae..ccbe24e697c4 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end) alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is -- cgit v1.2.3 From 62dd86ac01f9fb6386d7f8c6b389c3ea4582a50a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:02 -0500 Subject: x86/unwind: Fix dereference of untrusted pointer Tetsuo Handa and Fengguang Wu reported a panic in the unwinder: BUG: unable to handle kernel NULL pointer dereference at 000001f2 IP: update_stack_state+0xd4/0x340 *pde = 00000000 Oops: 0000 [#1] PREEMPT SMP CPU: 0 PID: 18728 Comm: 01-cpu-hotplug Not tainted 4.13.0-rc4-00170-gb09be67 #592 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014 task: bb0b53c0 task.stack: bb3ac000 EIP: update_stack_state+0xd4/0x340 EFLAGS: 00010002 CPU: 0 EAX: 0000a570 EBX: bb3adccb ECX: 0000f401 EDX: 0000a570 ESI: 00000001 EDI: 000001ba EBP: bb3adc6b ESP: bb3adc3f DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: 000001f2 CR3: 0b3a7000 CR4: 00140690 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: fffe0ff0 DR7: 00000400 Call Trace: ? unwind_next_frame+0xea/0x400 ? __unwind_start+0xf5/0x180 ? __save_stack_trace+0x81/0x160 ? save_stack_trace+0x20/0x30 ? __lock_acquire+0xfa5/0x12f0 ? lock_acquire+0x1c2/0x230 ? tick_periodic+0x3a/0xf0 ? _raw_spin_lock+0x42/0x50 ? tick_periodic+0x3a/0xf0 ? tick_periodic+0x3a/0xf0 ? debug_smp_processor_id+0x12/0x20 ? tick_handle_periodic+0x23/0xc0 ? local_apic_timer_interrupt+0x63/0x70 ? smp_trace_apic_timer_interrupt+0x235/0x6a0 ? trace_apic_timer_interrupt+0x37/0x3c ? strrchr+0x23/0x50 Code: 0f 95 c1 89 c7 89 45 e4 0f b6 c1 89 c6 89 45 dc 8b 04 85 98 cb 74 bc 88 4d e3 89 45 f0 83 c0 01 84 c9 89 04 b5 98 cb 74 bc 74 3b <8b> 47 38 8b 57 34 c6 43 1d 01 25 00 00 02 00 83 e2 03 09 d0 83 EIP: update_stack_state+0xd4/0x340 SS:ESP: 0068:bb3adc3f CR2: 00000000000001f2 ---[ end trace 0d147fd4aba8ff50 ]--- Kernel panic - not syncing: Fatal exception in interrupt On x86-32, after decoding a frame pointer to get a regs address, regs_size() dereferences the regs pointer when it checks regs->cs to see if the regs are user mode. This is dangerous because it's possible that what looks like a decoded frame pointer is actually a corrupt value, and we don't want the unwinder to make things worse. Instead of calling regs_size() on an unsafe pointer, just assume they're kernel regs to start with. Later, once it's safe to access the regs, we can do the user mode check and corresponding safety check for the remaining two regs. Reported-and-tested-by: Tetsuo Handa Reported-and-tested-by: Fengguang Wu Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5ed8d8bb38c5 ("x86/unwind: Move common code into update_stack_state()") Link: http://lkml.kernel.org/r/7f95b9a6993dec7674b3f3ab3dcd3294f7b9644d.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d145a0b1f529..d05637726c10 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -184,6 +184,12 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } +#ifdef CONFIG_X86_32 +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) +#else +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) +#endif + static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -202,7 +208,7 @@ static bool update_stack_state(struct unwind_state *state, regs = decode_frame_pointer(next_bp); if (regs) { frame = (unsigned long *)regs; - len = regs_size(regs); + len = KERNEL_REGS_SIZE; state->got_irq = true; } else { frame = next_bp; @@ -226,6 +232,14 @@ static bool update_stack_state(struct unwind_state *state, frame < prev_frame_end) return false; + /* + * On 32-bit with user mode regs, make sure the last two regs are safe + * to access: + */ + if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && + !on_stack(info, frame, len + 2*sizeof(long))) + return false; + /* Move state to the next frame: */ if (regs) { state->regs = regs; -- cgit v1.2.3 From 5c99b692cfd62f6915ed7ee7ed3c38d65ba3feab Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:03 -0500 Subject: x86/unwind: Use MSB for frame pointer encoding on 32-bit On x86-32, Tetsuo Handa and Fengguang Wu reported unwinder warnings like: WARNING: kernel stack regs at f60bb9c8 in swapper:1 has bad 'bp' value 0ba00000 And also there were some stack dumps with a bunch of unreliable '?' symbols after an apic_timer_interrupt symbol, meaning the unwinder got confused when it tried to read the regs. The cause of those issues is that, with GCC 4.8 (and possibly older), there are cases where GCC misaligns the stack pointer in a leaf function for no apparent reason: c124a388 : c124a388: 55 push %ebp c124a389: 89 e5 mov %esp,%ebp c124a38b: 57 push %edi c124a38c: 56 push %esi c124a38d: 89 d6 mov %edx,%esi c124a38f: 53 push %ebx c124a390: 31 db xor %ebx,%ebx c124a392: 83 ec 03 sub $0x3,%esp ... c124a3e3: 83 c4 03 add $0x3,%esp c124a3e6: 5b pop %ebx c124a3e7: 5e pop %esi c124a3e8: 5f pop %edi c124a3e9: 5d pop %ebp c124a3ea: c3 ret If an interrupt occurs in such a function, the regs on the stack will be unaligned, which breaks the frame pointer encoding assumption. So on 32-bit, use the MSB instead of the LSB to encode the regs. This isn't an issue on 64-bit, because interrupts align the stack before writing to it. Reported-and-tested-by: Tetsuo Handa Reported-and-tested-by: Fengguang Wu Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/279a26996a482ca716605c7dbc7f2db9d8d91e81.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/kernel/unwind_frame.c | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 8a13d468635a..50e0d2bc4528 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -176,7 +176,7 @@ /* * This is a sneaky trick to help the unwinder find pt_regs on the stack. The * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just setting the LSB, which makes it an invalid stack address and is also + * is just clearing the MSB, which makes it an invalid stack address and is also * a signal to the unwinder that it's a pt_regs pointer in disguise. * * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the @@ -185,7 +185,7 @@ .macro ENCODE_FRAME_POINTER #ifdef CONFIG_FRAME_POINTER mov %esp, %ebp - orl $0x1, %ebp + andl $0x7fffffff, %ebp #endif .endm diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d05637726c10..4949bbc95f75 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -174,6 +174,7 @@ static bool is_last_task_frame(struct unwind_state *state) * This determines if the frame pointer actually contains an encoded pointer to * pt_regs on the stack. See ENCODE_FRAME_POINTER. */ +#ifdef CONFIG_X86_64 static struct pt_regs *decode_frame_pointer(unsigned long *bp) { unsigned long regs = (unsigned long)bp; @@ -183,6 +184,17 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } +#else +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (regs & 0x80000000) + return NULL; + + return (struct pt_regs *)(regs | 0x80000000); +} +#endif #ifdef CONFIG_X86_32 #define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) -- cgit v1.2.3 From 99bd28a49b150e4b938313a63b5532d95ba77885 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:04 -0500 Subject: x86/unwind: Align stack pointer in unwinder dump When printing the unwinder dump, the stack pointer could be unaligned, for one of two reasons: - stack corruption; or - GCC created an unaligned stack. There's no way for the unwinder to tell the difference between the two, so we have to assume one or the other. GCC unaligned stacks are very rare, and have only been spotted before GCC 5. Presumably, if we're doing an unwinder stack dump, stack corruption is more likely than a GCC unaligned stack. So always align the stack before starting the dump. Reported-and-tested-by: Tetsuo Handa Reported-and-tested-by: Fengguang Wu Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2f540c515946ab09ed267e1a1d6421202a0cce08.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 4949bbc95f75..81aca077fbb6 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -44,7 +44,8 @@ static void unwind_dump(struct unwind_state *state) state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) break; -- cgit v1.2.3 From d4a2d031dd42f9594107d317e2a9a0c6d73ad46b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Oct 2017 20:20:05 -0500 Subject: x86/unwind: Disable unwinder warnings on 32-bit x86-32 doesn't have stack validation, so in most cases it doesn't make sense to warn about bad frame pointers. Reported-by: Tetsuo Handa Signed-off-by: Josh Poimboeuf Cc: Byungchul Park Cc: Fengguang Wu Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a69658760800bf281e6353248c23e0fa0acf5230.1507597785.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 81aca077fbb6..3dc26f95d46e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -355,6 +355,13 @@ bad_address: state->regs->sp < (unsigned long)task_pt_regs(state->task)) goto the_end; + /* + * There are some known frame pointer issues on 32-bit. Disable + * unwinder warnings on 32-bit until it gets objtool support. + */ + if (IS_ENABLED(CONFIG_X86_32)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", -- cgit v1.2.3 From 629eb703d3e46aa570c6c91235d38fd09ed8c58b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 9 Oct 2017 18:26:55 +0100 Subject: perf/x86/intel/uncore: Fix memory leaks on allocation failures Currently if an allocation fails then the error return paths don't free up any currently allocated pmus[].boxes and pmus causing a memory leak. Add an error clean up exit path that frees these objects. Detected by CoverityScan, CID#711632 ("Resource Leak") Signed-off-by: Colin Ian King Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Fixes: 087bfbb03269 ("perf/x86: Add generic Intel uncore PMU support") Link: http://lkml.kernel.org/r/20171009172655.6132-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 1c5390f1cf09..d45e06346f14 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -822,7 +822,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) pmus[i].type = type; pmus[i].boxes = kzalloc(size, GFP_KERNEL); if (!pmus[i].boxes) - return -ENOMEM; + goto err; } type->pmus = pmus; @@ -836,7 +836,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); if (!attr_group) - return -ENOMEM; + goto err; attrs = (struct attribute **)(attr_group + 1); attr_group->name = "events"; @@ -849,7 +849,15 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) } type->pmu_group = &uncore_pmu_attr_group; + return 0; + +err: + for (i = 0; i < type->num_boxes; i++) + kfree(pmus[i].boxes); + kfree(pmus); + + return -ENOMEM; } static int __init -- cgit v1.2.3 From a3b7424392924e778b608e30ee321f7b10cc94b8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Oct 2017 17:48:54 +0200 Subject: x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs hv_flush_pcpu_ex structures are not cleared between calls for performance reasons (they're variable size up to PAGE_SIZE each) but we must clear hv_vp_set.bank_contents part of it to avoid flushing unneeded vCPUs. The rest of the structure is formed correctly. To do the clearing in an efficient way stash the maximum possible vCPU number (this may differ from Linux CPU id). Reported-by: Jork Loeser Signed-off-by: Vitaly Kuznetsov Cc: Dexuan Cui Cc: Haiyang Zhang Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20171006154854.18092-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_init.c | 5 +++++ arch/x86/hyperv/mmu.c | 17 ++++++++++++----- arch/x86/include/asm/mshyperv.h | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1a8eb550c40f..a5db63f728a2 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs); u32 *hv_vp_index; EXPORT_SYMBOL_GPL(hv_vp_index); +u32 hv_max_vp_index; + static int hv_cpu_init(unsigned int cpu) { u64 msr_vp_index; @@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu) hv_vp_index[smp_processor_id()] = msr_vp_index; + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + return 0; } diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 39e7f6e50919..9502d04c0c95 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, { int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + /* valid_bank_mask can represent up to 64 banks */ + if (hv_max_vp_index / 64 >= 64) + return 0; + + /* + * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex + * structs are not cleared between calls, we risk flushing unneeded + * vCPUs otherwise. + */ + for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) + flush->hv_vp_set.bank_contents[vcpu_bank] = 0; + /* * Some banks may end up being empty but this is acceptable. */ @@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, vcpu = hv_cpu_number_to_vp_number(cpu); vcpu_bank = vcpu / 64; vcpu_offset = vcpu % 64; - - /* valid_bank_mask can represent up to 64 banks */ - if (vcpu_bank >= 64) - return 0; - __set_bit(vcpu_offset, (unsigned long *) &flush->hv_vp_set.bank_contents[vcpu_bank]); if (vcpu_bank >= nr_bank) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 738503e1f80c..530f448fddaf 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, * to this information. */ extern u32 *hv_vp_index; +extern u32 hv_max_vp_index; /** * hv_cpu_number_to_vp_number() - Map CPU to VP. -- cgit v1.2.3 From 60d73a7c96601434dfdb56d5b9167ff3b850d8d7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 5 Oct 2017 13:39:24 +0200 Subject: x86/hyperv: Don't use percpu areas for pcpu_flush/pcpu_flush_ex structures hv_do_hypercall() does virt_to_phys() translation and with some configs (CONFIG_SLAB) this doesn't work for percpu areas, we pass wrong memory to hypervisor and get #GP. We could use working slow_virt_to_phys() instead but doing so kills the performance. Move pcpu_flush/pcpu_flush_ex structures out of percpu areas and allocate memory on first call. The additional level of indirection gives us a small performance penalty, in future we may consider introducing hypercall functions which avoid virt_to_phys() conversion and cache physical addresses of pcpu_flush/pcpu_flush_ex structures somewhere. Reported-by: Simon Xiao Signed-off-by: Vitaly Kuznetsov Cc: Dexuan Cui Cc: Haiyang Zhang Cc: Jork Loeser Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20171005113924.28021-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/mmu.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 9502d04c0c95..f21cebbb5f6c 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -36,9 +36,9 @@ struct hv_flush_pcpu_ex { /* Each gva in gva_list encodes up to 4096 pages to flush */ #define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) -static struct hv_flush_pcpu __percpu *pcpu_flush; +static struct hv_flush_pcpu __percpu **pcpu_flush; -static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex; +static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex; /* * Fills in gva_list starting from offset. Returns the number of items added. @@ -109,6 +109,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, const struct flush_tlb_info *info) { int cpu, vcpu, gva_n, max_gvas; + struct hv_flush_pcpu **flush_pcpu; struct hv_flush_pcpu *flush; u64 status = U64_MAX; unsigned long flags; @@ -123,7 +124,17 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, local_irq_save(flags); - flush = this_cpu_ptr(pcpu_flush); + flush_pcpu = this_cpu_ptr(pcpu_flush); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } if (info->mm) { flush->address_space = virt_to_phys(info->mm->pgd); @@ -180,6 +191,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, const struct flush_tlb_info *info) { int nr_bank = 0, max_gvas, gva_n; + struct hv_flush_pcpu_ex **flush_pcpu; struct hv_flush_pcpu_ex *flush; u64 status = U64_MAX; unsigned long flags; @@ -194,7 +206,17 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, local_irq_save(flags); - flush = this_cpu_ptr(pcpu_flush_ex); + flush_pcpu = this_cpu_ptr(pcpu_flush_ex); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } if (info->mm) { flush->address_space = virt_to_phys(info->mm->pgd); @@ -273,7 +295,7 @@ void hyper_alloc_mmu(void) return; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) - pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + pcpu_flush = alloc_percpu(struct hv_flush_pcpu *); else - pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *); } -- cgit v1.2.3 From ab7ff471aa5db670197070760f022622793da7e5 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Cerri Date: Thu, 5 Oct 2017 10:34:29 -0300 Subject: x86/hyperv: Fix hypercalls with extended CPU ranges for TLB flushing Do not consider the fixed size of hv_vp_set when passing the variable header size to hv_do_rep_hypercall(). The Hyper-V hypervisor specification states that for a hypercall with a variable header only the size of the variable portion should be supplied via the input control. For HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX/LIST_EX calls that means the fixed portion of hv_vp_set should not be considered. That fixes random failures of some applications that are unexpectedly killed with SIGBUS or SIGSEGV. Signed-off-by: Marcelo Henrique Cerri Cc: Dexuan Cui Cc: Haiyang Zhang Cc: Jork Loeser Cc: Josh Poulson Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Simon Xiao Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: devel@linuxdriverproject.org Fixes: 628f54cc6451 ("x86/hyper-v: Support extended CPU ranges for TLB flush hypercalls") Link: http://lkml.kernel.org/r/1507210469-29065-1-git-send-email-marcelo.cerri@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index f21cebbb5f6c..9cc9e1c1e2db 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -251,18 +251,18 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; status = hv_do_rep_hypercall( HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, - 0, nr_bank + 2, flush, NULL); + 0, nr_bank, flush, NULL); } else if (info->end && ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { status = hv_do_rep_hypercall( HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, - 0, nr_bank + 2, flush, NULL); + 0, nr_bank, flush, NULL); } else { gva_n = fill_gva_list(flush->gva_list, nr_bank, info->start, info->end); status = hv_do_rep_hypercall( HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, - gva_n, nr_bank + 2, flush, NULL); + gva_n, nr_bank, flush, NULL); } local_irq_restore(flags); -- cgit v1.2.3 From eac779aa509d453a55da0ea4302bdb79c4e0854f Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sun, 8 Oct 2017 19:58:46 -0700 Subject: xen/vcpu: Use a unified name about cpu hotplug state for pv and pvhvm As xen_cpuhp_setup is called by PV and PVHVM, the name of "x86/xen/hvm_guest" is confusing. Signed-off-by: Zhenzhong Duan Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0e7ef69e8531..d669e9d89001 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -93,11 +93,11 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), int rc; rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "x86/xen/hvm_guest:prepare", + "x86/xen/guest:prepare", cpu_up_prepare_cb, cpu_dead_cb); if (rc >= 0) { rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "x86/xen/hvm_guest:online", + "x86/xen/guest:online", xen_cpu_up_online, NULL); if (rc < 0) cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); -- cgit v1.2.3 From fd19d3b45164466a4adce7cbff448ba9189e1427 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 5 Oct 2017 11:10:22 +0200 Subject: KVM: nVMX: update last_nonleaf_level when initializing nested EPT The function updates context->root_level but didn't call update_last_nonleaf_level so the previous and potentially wrong value was used for page walks. For example, a zero value of last_nonleaf_level would allow a potential out-of-bounds access in arch/x86/mmu/paging_tmpl.h's walk_addr_generic function (CVE-2017-12188). Fixes: 155a97a3d7c78b46cef6f1a973c831bc5a4f82bb Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 106d4a029a8a..3c25f20115bc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4555,6 +4555,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); + update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } -- cgit v1.2.3 From 829ee279aed43faa5cb1e4d65c0cad52f2426c53 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 5 Oct 2017 11:10:23 +0200 Subject: KVM: MMU: always terminate page walks at level 1 is_last_gpte() is not equivalent to the pseudo-code given in commit 6bb69c9b69c31 ("KVM: MMU: simplify last_pte_bitmap") because an incorrect value of last_nonleaf_level may override the result even if level == 1. It is critical for is_last_gpte() to return true on level == 1 to terminate page walks. Otherwise memory corruption may occur as level is used as an index to various data structures throughout the page walking code. Even though the actual bug would be wherever the MMU is initialized (as in the previous patch), be defensive and ensure here that is_last_gpte() returns the correct value. This patch is also enough to fix CVE-2017-12188. Fixes: 6bb69c9b69c315200ddc2bc79aee14c0184cf5b2 Cc: stable@vger.kernel.org Cc: Andy Honig Signed-off-by: Ladi Prosek [Panic if walk_addr_generic gets an incorrect level; this is a serious bug and it's not worth a WARN_ON where the recovery path might hide further exploitable issues; suggested by Andrew Honig. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 14 +++++++------- arch/x86/kvm/paging_tmpl.h | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3c25f20115bc..7a69cf053711 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3973,13 +3973,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { - /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - /* * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. * If it is clear, there are no large pages at this level, so clear @@ -3987,6 +3980,13 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, */ gpte &= level - mmu->last_nonleaf_level; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + return gpte & PT_PAGE_SIZE_MASK; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 86b68dc5a649..f18d1f8d332b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -334,10 +334,11 @@ retry_walk: --walker->level; index = PT_INDEX(addr, walker->level); - table_gfn = gpte_to_gfn(pte); offset = index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; + + BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; -- cgit v1.2.3 From 67bb8e999e0aeac285d22f0e53c856b9df5282c6 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 10 Oct 2017 14:45:04 -0500 Subject: x86/mm: Disable various instrumentations of mm/mem_encrypt.c and mm/tlb.c Some routines in mem_encrypt.c are called very early in the boot process, e.g. sme_enable(). When CONFIG_KCOV=y is defined the resulting code added to sme_enable() (and others) for KCOV instrumentation results in a kernel crash. Disable the KCOV instrumentation for mem_encrypt.c by adding KCOV_INSTRUMENT_mem_encrypt.o := n to arch/x86/mm/Makefile. In order to avoid other possible early boot issues, model mem_encrypt.c after head64.c in regards to tools. In addition to disabling KCOV as stated above and a previous patch that disables branch profiling, also remove the "-pg" CFLAG if CONFIG_FUNCTION_TRACER is enabled and set KASAN_SANITIZE to "n", each of which are done on a file basis. Reported-by: kernel test robot Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171010194504.18887.38053.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 72bf8c01c6e3..e1f095884386 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,12 @@ -# Kernel does not boot with instrumentation of tlb.c. -KCOV_INSTRUMENT_tlb.o := n +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n + +KASAN_SANITIZE_mem_encrypt.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mem_encrypt.o = -pg +endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o setup_nx.o tlb.o -- cgit v1.2.3 From 8eb3f87d903168bdbd1222776a6b1e281f50513e Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 10 Oct 2017 15:01:22 +0800 Subject: KVM: nVMX: fix guest CR4 loading when emulating L2 to L1 exit When KVM emulates an exit from L2 to L1, it loads L1 CR4 into the guest CR4. Before this CR4 loading, the guest CR4 refers to L2 CR4. Because these two CR4's are in different levels of guest, we should vmx_set_cr4() rather than kvm_set_cr4() here. The latter, which is used to handle guest writes to its CR4, checks the guest change to CR4 and may fail if the change is invalid. The failure may cause trouble. Consider we start a L1 guest with non-zero L1 PCID in use, (i.e. L1 CR4.PCIDE == 1 && L1 CR3.PCID != 0) and a L2 guest with L2 PCID disabled, (i.e. L2 CR4.PCIDE == 0) and following events may happen: 1. If kvm_set_cr4() is used in load_vmcs12_host_state() to load L1 CR4 into guest CR4 (in VMCS01) for L2 to L1 exit, it will fail because of PCID check. As a result, the guest CR4 recorded in L0 KVM (i.e. vcpu->arch.cr4) is left to the value of L2 CR4. 2. Later, if L1 attempts to change its CR4, e.g., clearing VMXE bit, kvm_set_cr4() in L0 KVM will think L1 also wants to enable PCID, because the wrong L2 CR4 is used by L0 KVM as L1 CR4. As L1 CR3.PCID != 0, L0 KVM will inject GP to L1 guest. Fixes: 4704d0befb072 ("KVM: nVMX: Exiting from L2 to L1") Cc: qemu-stable@nongnu.org Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a2b804e10c95..95a01609d7ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11297,7 +11297,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* Same as above - no reason to call set_cr4_guest_host_mask(). */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); + vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); -- cgit v1.2.3 From cc6afe2240298049585e86b1ade85efc8a7f225d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Oct 2017 12:12:57 +0200 Subject: x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on hypervisors Commit 594a30fb1242 ("x86/apic: Silence "FW_BUG TSC_DEADLINE disabled due to Errata" on CPUs without the feature", 2017-08-30) was also about silencing the warning on VirtualBox; however, KVM does expose the TSC deadline timer, and it's virtualized so that it is immune from CPU errata. Therefore, booting 4.13 with "-cpu Haswell" shows this in the logs: [ 0.000000] [Firmware Bug]: TSC_DEADLINE disabled due to Errata; please update microcode to version: 0xb2 (or later) Even if you had a hypervisor that does _not_ virtualize the TSC deadline and rather exposes the hardware one, it should be the hypervisors task to update microcode and possibly hide the flag from CPUID. So just hide the message when running on _any_ hypervisor, not just those that do not support the TSC deadline timer. The older check still makes sense, so keep it. Fixes: bd9240a18e ("x86/apic: Add TSC_DEADLINE quirk due to errata") Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Hans de Goede Cc: kvm@vger.kernel.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1507630377-54471-1-git-send-email-pbonzini@redhat.com --- arch/x86/kernel/apic/apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d705c769f77d..50109eae8cd7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -600,7 +600,8 @@ static void apic_check_deadline_errata(void) const struct x86_cpu_id *m; u32 rev; - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || + boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; m = x86_match_cpu(deadline_match); -- cgit v1.2.3 From 616dd5872e52493863b0202632703eebd51243dc Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 11 Oct 2017 17:16:04 -0400 Subject: x86/apic: Update TSC_DEADLINE quirk with additional SKX stepping SKX stepping-3 fixed the TSC_DEADLINE issue in a different ucode version number than stepping-4. Linux needs to know this stepping-3 specific version number to also enable the TSC_DEADLINE on stepping-3. The steppings and ucode versions are documented in the SKX BIOS update: https://downloadmirror.intel.com/26978/eng/ReleaseNotes_R00.01.0004.txt Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/60f2bbf7cf617e212b522e663f84225bfebc50e5.1507756305.git.len.brown@intel.com --- arch/x86/kernel/apic/apic.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 50109eae8cd7..ff891772c9f8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -573,11 +573,21 @@ static u32 bdx_deadline_rev(void) return ~0U; } +static u32 skx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x03: return 0x01000136; + case 0x04: return 0x02000014; + } + + return ~0U; +} + static const struct x86_cpu_id deadline_match[] = { DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), -- cgit v1.2.3 From b956575bed91ecfb136a8300742ecbbf451471ab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 9 Oct 2017 09:50:49 -0700 Subject: x86/mm: Flush more aggressively in lazy TLB mode Since commit: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") x86's lazy TLB mode has been all the way lazy: when running a kernel thread (including the idle thread), the kernel keeps using the last user mm's page tables without attempting to maintain user TLB coherence at all. From a pure semantic perspective, this is fine -- kernel threads won't attempt to access user pages, so having stale TLB entries doesn't matter. Unfortunately, I forgot about a subtlety. By skipping TLB flushes, we also allow any paging-structure caches that may exist on the CPU to become incoherent. This means that we can have a paging-structure cache entry that references a freed page table, and the CPU is within its rights to do a speculative page walk starting at the freed page table. I can imagine this causing two different problems: - A speculative page walk starting from a bogus page table could read IO addresses. I haven't seen any reports of this causing problems. - A speculative page walk that involves a bogus page table can install garbage in the TLB. Such garbage would always be at a user VA, but some AMD CPUs have logic that triggers a machine check when it notices these bogus entries. I've seen a couple reports of this. Boris further explains the failure mode: > It is actually more of an optimization which assumes that paging-structure > entries are in WB DRAM: > > "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables > performance optimization that assumes PML4, PDP, PDE, and PTE entries > are in cacheable WB-DRAM; memory type checks may be bypassed, and > addresses outside of WB-DRAM may result in undefined behavior or NB > protocol errors. 1=Disables performance optimization and allows PML4, > PDP, PDE and PTE entries to be in any memory type. Operating systems > that maintain page tables in memory types other than WB- DRAM must set > TlbCacheDis to insure proper operation." > > The MCE generated is an NB protocol error to signal that > > "Link: A specific coherent-only packet from a CPU was issued to an > IO link. This may be caused by software which addresses page table > structures in a memory type other than cacheable WB-DRAM without > properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for > example, when page table structure addresses are above top of memory. In > such cases, the NB will generate an MCE if it sees a mismatch between > the memory operation generated by the core and the link type." > > I'm assuming coherent-only packets don't go out on IO links, thus the > error. To fix this, reinstate TLB coherence in lazy mode. With this patch applied, we do it in one of two ways: - If we have PCID, we simply switch back to init_mm's page tables when we enter a kernel thread -- this seems to be quite cheap except for the cost of serializing the CPU. - If we don't have PCID, then we set a flag and switch to init_mm the first time we would otherwise need to flush the TLB. The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed to override the default mode for benchmarking. In theory, we could optimize this better by only flushing the TLB in lazy CPUs when a page table is freed. Doing that would require auditing the mm code to make sure that all page table freeing goes through tlb_remove_page() as well as reworking some data structures to implement the improved flush logic. Reported-by: Markus Trippelsdorf Reported-by: Adam Borowski Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Borkmann Cc: Eric Biggers Cc: Johannes Hirte Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Roman Kagan Cc: Thomas Gleixner Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 8 +- arch/x86/include/asm/tlbflush.h | 24 ++++++ arch/x86/mm/tlb.c | 153 +++++++++++++++++++++++++++---------- 3 files changed, 136 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c120b5db178a..3c856a15b98e 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) DEBUG_LOCKS_WARN_ON(preemptible()); } -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ - int cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, mm_cpumask(mm))) - cpumask_clear_cpu(cpu, mm_cpumask(mm)); -} +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4893abf7f74f..d362161d3291 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point + * to init_mm when we switch to a kernel thread (e.g. the idle thread). If + * it's false, then we immediately switch CR3 when entering a kernel thread. + */ +DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + /* * 6 because 6 should be plenty and struct tlb_state will fit in * two cache lines. @@ -104,6 +111,23 @@ struct tlb_state { u16 loaded_mm_asid; u16 next_asid; + /* + * We can be in one of several states: + * + * - Actively using an mm. Our CPU's bit will be set in + * mm_cpumask(loaded_mm) and is_lazy == false; + * + * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit + * will not be set in mm_cpumask(&init_mm) and is_lazy == false. + * + * - Lazily using a real mm. loaded_mm != &init_mm, our bit + * is set in mm_cpumask(loaded_mm), but is_lazy == true. + * We're heuristically guessing that the CR3 load we + * skipped more than makes up for the overhead added by + * lazy mode. + */ + bool is_lazy; + /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 49d9778376d7..658bf0090565 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -30,6 +30,8 @@ atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 *new_asid, bool *need_flush) { @@ -80,7 +82,7 @@ void leave_mm(int cpu) return; /* Warn if we're not lazy. */ - WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); switch_mm(NULL, &init_mm, NULL); } @@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, __flush_tlb_all(); } #endif + this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); - if (cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * There's nothing to do: we weren't lazy, and we - * aren't changing our mm. We don't need to flush - * anything, nor do we need to update CR3, CR4, or - * LDTR. - */ - return; - } - - /* Resume remote flushes and then read tlb_gen. */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < - next_tlb_gen) { - /* - * Ideally, we'd have a flush_tlb() variant that - * takes the known CR3 value as input. This would - * be faster on Xen PV and on hypothetical CPUs - * on which INVPCID is fast. - */ - this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, - next_tlb_gen); - write_cr3(build_cr3(next, prev_asid)); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); - } - /* - * We just exited lazy mode, which means that CR4 and/or LDTR - * may be stale. (Changes to the required CR4 and LDTR states - * are not reflected in tlb_gen.) + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ + if (WARN_ON_ONCE(real_prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + + return; } else { u16 new_asid; bool need_flush; @@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* Stop remote flushes for the previous mm */ - if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - - VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && + real_prev != &init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); /* * Start remote flushes and then read tlb_gen. @@ -232,6 +212,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_ldt(real_prev, next); } +/* + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). + */ +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; + + if (static_branch_unlikely(&tlb_use_lazy_mode)) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } +} + /* * Call this when reinitializing a CPU. It fixes the following potential * problems: @@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + if (unlikely(loaded_mm == &init_mm)) + return; + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + if (this_cpu_read(cpu_tlbstate.is_lazy)) { /* - * We're in lazy mode -- don't flush. We can get here on - * remote flushes due to races and on local flushes if a - * kernel thread coincidentally flushes the mm it's lazily - * still using. + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. */ + switch_mm_irqs_off(NULL, &init_mm, NULL); return; } @@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[2]; + + buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0'; + buf[1] = '\n'; + + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t tlblazy_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + bool val; + + if (kstrtobool_from_user(user_buf, count, &val)) + return -EINVAL; + + if (val) + static_branch_enable(&tlb_use_lazy_mode); + else + static_branch_disable(&tlb_use_lazy_mode); + + return count; +} + +static const struct file_operations fops_tlblazy = { + .read = tlblazy_read_file, + .write = tlblazy_write_file, + .llseek = default_llseek, +}; + +static int __init init_tlb_use_lazy_mode(void) +{ + if (boot_cpu_has(X86_FEATURE_PCID)) { + /* + * Heuristic: with PCID on, switching to and from + * init_mm is reasonably fast, but remote flush IPIs + * as expensive as ever, so turn off lazy TLB mode. + * + * We can't do this in setup_pcid() because static keys + * haven't been initialized yet, and it would blow up + * badly. + */ + static_branch_disable(&tlb_use_lazy_mode); + } + + debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlblazy); + return 0; +} +late_initcall(init_tlb_use_lazy_mode); -- cgit v1.2.3 From 1f161f67a272cc4f29f27934dd3f74cb657eb5c4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 12 Oct 2017 13:23:16 +0200 Subject: x86/microcode: Do the family check first On CPUs like AMD's Geode, for example, we shouldn't even try to load microcode because they do not support the modern microcode loading interface. However, we do the family check *after* the other checks whether the loader has been disabled on the command line or whether we're running in a guest. So move the family checks first in order to exit early if we're being loaded on an unsupported family. Reported-and-tested-by: Sven Glodowski Signed-off-by: Borislav Petkov Cc: # 4.11.. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://bugzilla.suse.com/show_bug.cgi?id=1061396 Link: http://lkml.kernel.org/r/20171012112316.977-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 86e8f0b2537b..c4fa4a85d4cb 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (!have_cpuid_p()) - return *res; - /* * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not * completely accurate as xen pv guests don't see that CPUID bit set but @@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; + bool intel = true; - if (check_loader_disabled_bsp()) + if (!have_cpuid_p()) return; cpuid_1_eax = native_cpuid_eax(1); switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (x86_family(cpuid_1_eax) >= 6) - load_ucode_intel_bsp(); + if (x86_family(cpuid_1_eax) < 6) + return; break; + case X86_VENDOR_AMD: - if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_bsp(cpuid_1_eax); + if (x86_family(cpuid_1_eax) < 0x10) + return; + intel = false; break; + default: - break; + return; } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(cpuid_1_eax); } static bool check_loader_disabled_ap(void) -- cgit v1.2.3 From e6fc454b7794fc45c27364c7896b8f03094635ee Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 15 Oct 2017 17:02:03 +0100 Subject: x86/cpu/intel_cacheinfo: Remove redundant assignment to 'this_leaf' The 'this_leaf' variable is assigned a value that is never read and it is updated a little later with a newer value, hence we can remove the redundant assignment. Cleans up the following Clang warning: Value stored to 'this_leaf' is never read Signed-off-by: Colin Ian King Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20171015160203.12332-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 24f749324c0f..9990a71e311f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -831,7 +831,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; - this_leaf = this_cpu_ci->info_list + index; nshared = base->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).apicid; first = apicid - (apicid % nshared); -- cgit v1.2.3 From 9c48c0965b97e14ddcf75490a754e84e05aaa062 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Oct 2017 12:12:16 +0200 Subject: x86/idt: Initialize early IDT before cr4_init_shadow() Moving the early IDT setup out of assembly code breaks the boot on first generation 486 systems. The reason is that the call of idt_setup_early_handler, which sets up the early handlers was added after the call to cr4_init_shadow(). cr4_init_shadow() tries to read CR4 which is not available on those systems. The accessor function uses a extable fixup to handle the resulting fault. As the IDT is not set up yet, the cr4 read exception causes an instantaneous reboot for obvious reasons. Call idt_setup_early_handler() before cr4_init_shadow() so IDT is set up before the first exception hits. Fixes: 87e81786b13b ("x86/idt: Move early IDT setup out of 32-bit asm") Reported-and-tested-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710161210290.1973@nanos --- arch/x86/kernel/head32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index cf2ce063f65a..2902ca4d5993 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -30,10 +30,11 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { - cr4_init_shadow(); - + /* Make sure IDT is set up before any exception happens */ idt_setup_early_handler(); + cr4_init_shadow(); + sanitize_boot_params(&boot_params); x86_early_init_platform_quirks(); -- cgit v1.2.3 From 723f2828a98c8ca19842042f418fb30dd8cfc0f7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Oct 2017 13:12:25 +0200 Subject: x86/microcode/intel: Disable late loading on model 79 Blacklist Broadwell X model 79 for late loading due to an erratum. Signed-off-by: Borislav Petkov Acked-by: Tony Luck Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171018111225.25635-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8f7a9bbad514..7dbcb7adf797 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -918,6 +919,18 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } +static bool is_blacklisted(unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { + pr_err_once("late loading on model 79 is disabled.\n"); + return true; + } + + return false; +} + static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { @@ -926,6 +939,9 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, const struct firmware *firmware; enum ucode_state ret; + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); @@ -950,6 +966,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } -- cgit v1.2.3 From e8b9b0cc8269c85d8167aaee024bfcbb4976c031 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 14 Oct 2017 09:59:49 -0700 Subject: x86/mm/64: Remove the last VM_BUG_ON() from the TLB code Let's avoid hard-to-diagnose crashes in the future. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f423bbc97864089fbdeb813f1ea126c6eaed844a.1508000261.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 658bf0090565..7db23f9f804e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -147,8 +147,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { - VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != - next->context.ctx_id); + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); /* * We don't currently support having a real mm loaded without -- cgit v1.2.3 From 4e57b94664fef55aa71cac33b4632fdfdd52b695 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 14 Oct 2017 09:59:50 -0700 Subject: x86/mm: Tidy up "x86/mm: Flush more aggressively in lazy TLB mode" Due to timezones, commit: b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode") was an outdated patch that well tested and fixed the bug but didn't address Borislav's review comments. Tidy it up: - The name "tlb_use_lazy_mode()" was highly confusing. Change it to "tlb_defer_switch_to_init_mm()", which describes what it actually means. - Move the static_branch crap into a helper. - Improve comments. Actually removing the debugfs option is in the next patch. Reported-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode") Link: http://lkml.kernel.org/r/154ef95428d4592596b6e98b0af1d2747d6cfbf8.1508000261.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 7 ++++++- arch/x86/mm/tlb.c | 30 ++++++++++++++++++------------ 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d362161d3291..0d4a1bb7e303 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -87,7 +87,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) * to init_mm when we switch to a kernel thread (e.g. the idle thread). If * it's false, then we immediately switch CR3 when entering a kernel thread. */ -DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); +DECLARE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm); + +static inline bool tlb_defer_switch_to_init_mm(void) +{ + return static_branch_unlikely(&__tlb_defer_switch_to_init_mm); +} /* * 6 because 6 should be plenty and struct tlb_state will fit in diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 7db23f9f804e..5ee3b59baa85 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -30,7 +30,7 @@ atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); -DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); +DEFINE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm); static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 *new_asid, bool *need_flush) @@ -213,6 +213,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* + * Please ignore the name of this function. It should be called + * switch_to_kernel_thread(). + * * enter_lazy_tlb() is a hint from the scheduler that we are entering a * kernel thread or other context without an mm. Acceptable implementations * include doing nothing whatsoever, switching to init_mm, or various clever @@ -227,7 +230,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (static_branch_unlikely(&tlb_use_lazy_mode)) { + if (tlb_defer_switch_to_init_mm()) { /* * There's a significant optimization that may be possible * here. We have accurate enough TLB flush tracking that we @@ -632,7 +635,8 @@ static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, { char buf[2]; - buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0'; + buf[0] = static_branch_likely(&__tlb_defer_switch_to_init_mm) + ? '1' : '0'; buf[1] = '\n'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -647,9 +651,9 @@ static ssize_t tlblazy_write_file(struct file *file, return -EINVAL; if (val) - static_branch_enable(&tlb_use_lazy_mode); + static_branch_enable(&__tlb_defer_switch_to_init_mm); else - static_branch_disable(&tlb_use_lazy_mode); + static_branch_disable(&__tlb_defer_switch_to_init_mm); return count; } @@ -660,23 +664,25 @@ static const struct file_operations fops_tlblazy = { .llseek = default_llseek, }; -static int __init init_tlb_use_lazy_mode(void) +static int __init init_tlblazy(void) { if (boot_cpu_has(X86_FEATURE_PCID)) { /* - * Heuristic: with PCID on, switching to and from - * init_mm is reasonably fast, but remote flush IPIs - * as expensive as ever, so turn off lazy TLB mode. + * If we have PCID, then switching to init_mm is reasonably + * fast. If we don't have PCID, then switching to init_mm is + * quite slow, so we default to trying to defer it in the + * hopes that we can avoid it entirely. The latter approach + * runs the risk of receiving otherwise unnecessary IPIs. * * We can't do this in setup_pcid() because static keys * haven't been initialized yet, and it would blow up * badly. */ - static_branch_disable(&tlb_use_lazy_mode); + static_branch_disable(&__tlb_defer_switch_to_init_mm); } - debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, + debugfs_create_file("tlb_defer_switch_to_init_mm", S_IRUSR | S_IWUSR, arch_debugfs_dir, NULL, &fops_tlblazy); return 0; } -late_initcall(init_tlb_use_lazy_mode); +late_initcall(init_tlblazy); -- cgit v1.2.3 From 7ac7f2c315ef76437f5119df354d334448534fb5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 14 Oct 2017 09:59:51 -0700 Subject: x86/mm: Remove debug/x86/tlb_defer_switch_to_init_mm Borislav thinks that we don't need this knob in a released kernel. Get rid of it. Requested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode") Link: http://lkml.kernel.org/r/1fa72431924e81e86c164ff7881bf9240d1f1a6c.1508000261.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 20 ++++++++------ arch/x86/mm/tlb.c | 58 ----------------------------------------- 2 files changed, 12 insertions(+), 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 0d4a1bb7e303..c4aed0de565e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -82,16 +82,20 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif -/* - * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point - * to init_mm when we switch to a kernel thread (e.g. the idle thread). If - * it's false, then we immediately switch CR3 when entering a kernel thread. - */ -DECLARE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm); - static inline bool tlb_defer_switch_to_init_mm(void) { - return static_branch_unlikely(&__tlb_defer_switch_to_init_mm); + /* + * If we have PCID, then switching to init_mm is reasonably + * fast. If we don't have PCID, then switching to init_mm is + * quite slow, so we try to defer it in the hopes that we can + * avoid it entirely. The latter approach runs the risk of + * receiving otherwise unnecessary IPIs. + * + * This choice is just a heuristic. The tlb code can handle this + * function returning true or false regardless of whether we have + * PCID. + */ + return !static_cpu_has(X86_FEATURE_PCID); } /* diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5ee3b59baa85..0f3d0cea4d00 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -30,7 +30,6 @@ atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); -DEFINE_STATIC_KEY_TRUE(__tlb_defer_switch_to_init_mm); static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 *new_asid, bool *need_flush) @@ -629,60 +628,3 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); - -static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - char buf[2]; - - buf[0] = static_branch_likely(&__tlb_defer_switch_to_init_mm) - ? '1' : '0'; - buf[1] = '\n'; - - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t tlblazy_write_file(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - bool val; - - if (kstrtobool_from_user(user_buf, count, &val)) - return -EINVAL; - - if (val) - static_branch_enable(&__tlb_defer_switch_to_init_mm); - else - static_branch_disable(&__tlb_defer_switch_to_init_mm); - - return count; -} - -static const struct file_operations fops_tlblazy = { - .read = tlblazy_read_file, - .write = tlblazy_write_file, - .llseek = default_llseek, -}; - -static int __init init_tlblazy(void) -{ - if (boot_cpu_has(X86_FEATURE_PCID)) { - /* - * If we have PCID, then switching to init_mm is reasonably - * fast. If we don't have PCID, then switching to init_mm is - * quite slow, so we default to trying to defer it in the - * hopes that we can avoid it entirely. The latter approach - * runs the risk of receiving otherwise unnecessary IPIs. - * - * We can't do this in setup_pcid() because static keys - * haven't been initialized yet, and it would blow up - * badly. - */ - static_branch_disable(&__tlb_defer_switch_to_init_mm); - } - - debugfs_create_file("tlb_defer_switch_to_init_mm", S_IRUSR | S_IWUSR, - arch_debugfs_dir, NULL, &fops_tlblazy); - return 0; -} -late_initcall(init_tlblazy); -- cgit v1.2.3 From ce56a86e2ade45d052b3228cdfebe913a1ae7381 Mon Sep 17 00:00:00 2001 From: Craig Bergstrom Date: Thu, 19 Oct 2017 13:28:56 -0600 Subject: x86/mm: Limit mmap() of /dev/mem to valid physical addresses Currently, it is possible to mmap() any offset from /dev/mem. If a program mmaps() /dev/mem offsets outside of the addressable limits of a system, the page table can be corrupted by setting reserved bits. For example if you mmap() offset 0x0001000000000000 of /dev/mem on an x86_64 system with a 48-bit bus, the page fault handler will be called with error_code set to RSVD. The kernel then crashes with a page table corruption error. This change prevents this page table corruption on x86 by refusing to mmap offsets higher than the highest valid address in the system. Signed-off-by: Craig Bergstrom Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: dsafonov@virtuozzo.com Cc: kirill.shutemov@linux.intel.com Cc: mhocko@suse.com Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/20171019192856.39672-1-craigb@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 4 ++++ arch/x86/mm/mmap.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index c40a95c33bb8..322d25ae23ab 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -110,6 +110,10 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) #endif +#define ARCH_HAS_VALID_PHYS_ADDR_RANGE +extern int valid_phys_addr_range(phys_addr_t addr, size_t size); +extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); + /** * virt_to_phys - map virtual addresses to physical * @address: address to remap diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index a99679826846..320c6237e1d1 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -174,3 +174,15 @@ const char *arch_vma_name(struct vm_area_struct *vma) return "[mpx]"; return NULL; } + +int valid_phys_addr_range(phys_addr_t addr, size_t count) +{ + return addr + count <= __pa(high_memory); +} + +int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) +{ + phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT; + + return valid_phys_addr_range(addr, count); +} -- cgit v1.2.3 From bfc1168de949cd3e9ca18c3480b5085deff1ea7c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 22 Oct 2017 12:47:31 +0200 Subject: x86/cpu/AMD: Apply the Erratum 688 fix when the BIOS doesn't Some F14h machines have an erratum which, "under a highly specific and detailed set of internal timing conditions" can lead to skipping instructions and RIP corruption. Add the fix for those machines when their BIOS doesn't apply it or there simply isn't BIOS update for them. Tested-by: Signed-off-by: Borislav Petkov Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sherry Hurwitz Cc: Thomas Gleixner Cc: Yazen Ghannam Link: http://lkml.kernel.org/r/20171022104731.28249-1-bp@alien8.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=197285 [ Added pr_info() that we activated the workaround. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_nb.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 458da8509b75..6db28f17ff28 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -27,6 +27,8 @@ static const struct pci_device_id amd_root_ids[] = { {} }; +#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 + const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -37,6 +39,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, {} }; EXPORT_SYMBOL_GPL(amd_nb_misc_ids); @@ -48,6 +51,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; @@ -402,11 +406,48 @@ void amd_flush_garts(void) } EXPORT_SYMBOL_GPL(amd_flush_garts); +static void __fix_erratum_688(void *info) +{ +#define MSR_AMD64_IC_CFG 0xC0011021 + + msr_set_bit(MSR_AMD64_IC_CFG, 3); + msr_set_bit(MSR_AMD64_IC_CFG, 14); +} + +/* Apply erratum 688 fix so machines without a BIOS fix work. */ +static __init void fix_erratum_688(void) +{ + struct pci_dev *F4; + u32 val; + + if (boot_cpu_data.x86 != 0x14) + return; + + if (!amd_northbridges.num) + return; + + F4 = node_to_amd_nb(0)->link; + if (!F4) + return; + + if (pci_read_config_dword(F4, 0x164, &val)) + return; + + if (val & BIT(2)) + return; + + on_each_cpu(__fix_erratum_688, NULL, 0); + + pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n"); +} + static __init int init_amd_nbs(void) { amd_cache_northbridges(); amd_cache_gart(); + fix_erratum_688(); + return 0; } -- cgit v1.2.3 From 98990a33b77dda9babf91cb235654f6729e5702e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 20 Oct 2017 11:21:33 -0500 Subject: x86/entry: Fix idtentry unwind hint This fixes the following ORC warning in the 'int3' entry code: WARNING: can't dereference iret registers at ffff8801c5f17fe0 for ip ffffffff95f0d94b The ORC metadata had the wrong stack offset for the iret registers. Their location on the stack is dependent on whether the exception has an error code. Reported-and-tested-by: Andrei Vagin Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 8c1f75587a18 ("x86/entry/64: Add unwind hint annotations") Link: http://lkml.kernel.org/r/931d57f0551ed7979d5e7e05370d445c8e5137f8.1508516398.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 49167258d587..f6cdb7a1455e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -808,7 +808,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) - UNWIND_HINT_IRET_REGS offset=8 + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 -- cgit v1.2.3 From 58c3862b521ead4f69a24ef009a679cb3c519620 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 20 Oct 2017 11:21:34 -0500 Subject: x86/unwind: Show function name+offset in ORC error messages Improve the warning messages to show the relevant function name+offset. This makes it much easier to diagnose problems with the ORC metadata. Before: WARNING: can't dereference iret registers at ffff8801c5f17fe0 for ip ffffffff95f0d94b After: WARNING: can't dereference iret registers at ffff880178f5ffe0 for ip int3+0x5b/0x60 Reported-by: Andrei Vagin Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Link: http://lkml.kernel.org/r/6bada6b9eac86017e16bd79e1e77877935cb50bb.1508516398.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_orc.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 570b70d3f604..b95007e7c1b3 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -86,8 +86,8 @@ static struct orc_entry *orc_find(unsigned long ip) idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; if (unlikely((idx >= lookup_num_blocks-1))) { - orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", - idx, lookup_num_blocks, ip); + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n", + idx, lookup_num_blocks, (void *)ip); return NULL; } @@ -96,8 +96,8 @@ static struct orc_entry *orc_find(unsigned long ip) if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || (__start_orc_unwind + stop > __stop_orc_unwind))) { - orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", - idx, lookup_num_blocks, start, stop, ip); + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n", + idx, lookup_num_blocks, start, stop, (void *)ip); return NULL; } @@ -373,7 +373,7 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_R10: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R10 at ip %p\n", + orc_warn("missing regs for base reg R10 at ip %pB\n", (void *)state->ip); goto done; } @@ -382,7 +382,7 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_R13: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R13 at ip %p\n", + orc_warn("missing regs for base reg R13 at ip %pB\n", (void *)state->ip); goto done; } @@ -391,7 +391,7 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_DI: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DI at ip %p\n", + orc_warn("missing regs for base reg DI at ip %pB\n", (void *)state->ip); goto done; } @@ -400,7 +400,7 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_DX: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DX at ip %p\n", + orc_warn("missing regs for base reg DX at ip %pB\n", (void *)state->ip); goto done; } @@ -408,7 +408,7 @@ bool unwind_next_frame(struct unwind_state *state) break; default: - orc_warn("unknown SP base reg %d for ip %p\n", + orc_warn("unknown SP base reg %d for ip %pB\n", orc->sp_reg, (void *)state->ip); goto done; } @@ -436,7 +436,7 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_TYPE_REGS: if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { - orc_warn("can't dereference registers at %p for ip %p\n", + orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; } @@ -448,7 +448,7 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_TYPE_REGS_IRET: if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { - orc_warn("can't dereference iret registers at %p for ip %p\n", + orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; } @@ -465,7 +465,8 @@ bool unwind_next_frame(struct unwind_state *state) break; default: - orc_warn("unknown .orc_unwind entry type %d\n", orc->type); + orc_warn("unknown .orc_unwind entry type %d for ip %pB\n", + orc->type, (void *)orig_ip); break; } @@ -487,7 +488,7 @@ bool unwind_next_frame(struct unwind_state *state) break; default: - orc_warn("unknown BP base reg %d for ip %p\n", + orc_warn("unknown BP base reg %d for ip %pB\n", orc->bp_reg, (void *)orig_ip); goto done; } @@ -496,7 +497,7 @@ bool unwind_next_frame(struct unwind_state *state) if (state->stack_info.type == prev_type && on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && state->sp <= prev_sp) { - orc_warn("stack going in the wrong direction? ip=%p\n", + orc_warn("stack going in the wrong direction? ip=%pB\n", (void *)orig_ip); goto done; } -- cgit v1.2.3