From 7ad816762f9bf89e940e618ea40c43138b479e10 Mon Sep 17 00:00:00 2001 From: Petteri Aimonen Date: Tue, 16 Jun 2020 11:12:57 +0200 Subject: x86/fpu: Reset MXCSR to default in kernel_fpu_begin() Previously, kernel floating point code would run with the MXCSR control register value last set by the userland code of the thread that was active on the CPU core just before the kernel call. This could affect calculation results if the rounding mode was changed, or cause a crash if an FPU/SIMD exception was unmasked. Restore MXCSR to the kernel's default value. [ bp: Carve out from a bigger patch by Petteri, add feature check, add FNINIT call too (amluto). ] Signed-off-by: Petteri Aimonen Signed-off-by: Borislav Petkov Link: https://bugzilla.kernel.org/show_bug.cgi?id=207979 Link: https://lkml.kernel.org/r/20200624114646.28953-2-bp@alien8.de --- arch/x86/include/asm/fpu/internal.h | 5 +++++ arch/x86/kernel/fpu/core.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42159f45bf9c..845e7481ab77 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -623,6 +623,11 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * MXCSR and XCR definitions: */ +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + extern unsigned int mxcsr_feature_mask; #define XCR_XFEATURE_ENABLED_MASK 0x00000000 diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 06c818967bb6..15247b96c6ea 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,6 +101,12 @@ void kernel_fpu_begin(void) copy_fpregs_to_fpstate(&current->thread.fpu); } __cpu_invalidate_fpregs_state(); + + if (boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin); -- cgit v1.2.3 From 9d3c447c72fb2337ca39f245c6ae89f2369de216 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 29 Jun 2020 18:26:31 +0800 Subject: KVM: X86: Fix async pf caused null-ptr-deref Syzbot reported that: CPU: 1 PID: 6780 Comm: syz-executor153 Not tainted 5.7.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__apic_accept_irq+0x46/0xb80 Call Trace: kvm_arch_async_page_present+0x7de/0x9e0 kvm_check_async_pf_completion+0x18d/0x400 kvm_arch_vcpu_ioctl_run+0x18bf/0x69f0 kvm_vcpu_ioctl+0x46a/0xe20 ksys_ioctl+0x11a/0x180 __x64_sys_ioctl+0x6f/0xb0 do_syscall_64+0xf6/0x7d0 entry_SYSCALL_64_after_hwframe+0x49/0xb3 The testcase enables the APF mechanism by writing MSR_KVM_ASYNC_PF_EN with ASYNC_PF_INT set without having written MSR_KVM_ASYNC_PF_INT first. What's worse, interrupt-based APF 'page ready' event delivery depends on an in-kernel lapic, but we did not bail out when the lapic is not in the kernel while the guest was setting MSR_KVM_ASYNC_PF_EN, which causes the null-ptr-deref in the host later. This patch fixes it. 
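Schematically, the failing setup reduces to a guest enabling interrupt-based async PF while the VM has no in-kernel LAPIC. The sketch below is illustrative only: the constant names are from the KVM paravirt UAPI, the guest-physical address is a placeholder, and the real syzkaller program differs.

    /* guest side (sketch): enable async PF with 'page ready' delivery as an
     * interrupt, although MSR_KVM_ASYNC_PF_INT was never programmed.
     */
    wrmsrl(MSR_KVM_ASYNC_PF_EN,
           apf_area_gpa | KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT);

    /* host side: the VM was created without KVM_CREATE_IRQCHIP, so
     * vcpu->arch.apic is NULL and the later 'page ready' injection via
     * __apic_accept_irq() dereferences that NULL pointer.
     */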
Reported-by: syzbot+1bf777dfdde86d64b89b@syzkaller.appspotmail.com Fixes: 2635b5c4a0 (KVM: x86: interrupt based APF 'page ready' event delivery) Signed-off-by: Wanpeng Li Message-Id: <1593426391-8231-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b92db412335..a026d926072c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2693,6 +2693,9 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!lapic_in_kernel(vcpu)) + return 1; + vcpu->arch.apf.msr_en_val = data; if (!kvm_pv_async_pf_enabled(vcpu)) { -- cgit v1.2.3 From 5ecad245de2ae23dc4e2dbece92f8ccfbaed2fa7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 30 Jun 2020 07:07:20 -0400 Subject: KVM: x86: bit 8 of non-leaf PDPEs is not reserved Bit 8 would be the "global" bit, which does not quite make sense for non-leaf page table entries. Intel ignores it; AMD ignores it in PDEs and PDPEs, but reserves it in PML4Es. Probably, earlier versions of the AMD manual documented it as reserved in PDPEs as well, and that behavior made it into KVM as well as kvm-unit-tests; fix it. Cc: stable@vger.kernel.org Reported-by: Nadav Amit Fixes: a0c0feb57992 ("KVM: x86: reserve bit 8 of non-leaf PDPEs and PML4Es in 64-bit mode on AMD", 2014-09-03) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 76817d13c86e..6d6a0ae7800c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4449,7 +4449,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); -- cgit v1.2.3 From 009bce1df0bb5eb970b9eb98d963861f7fe353c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Jun 2020 12:26:05 -0700 Subject: x86/split_lock: Don't write MSR_TEST_CTRL on CPUs that aren't whitelisted Choo! Choo! All aboard the Split Lock Express, with direct service to Wreckage! Skip split_lock_verify_msr() if the CPU isn't whitelisted as a possible SLD-enabled CPU model to avoid writing MSR_TEST_CTRL. MSR_TEST_CTRL exists, and is writable, on many generations of CPUs. Writing the MSR, even with '0', can result in bizarre, undocumented behavior. This fixes a crash on Haswell when resuming from suspend with a live KVM guest. Because APs use the standard SMP boot flow for resume, they will go through split_lock_init() and the subsequent RDMSR/WRMSR sequence, which runs even when sld_state==sld_off to ensure SLD is disabled. On Haswell (at least, my Haswell), writing MSR_TEST_CTRL with '0' will succeed and _may_ take the SMT _sibling_ out of VMX root mode. When KVM has an active guest, KVM performs VMXON as part of CPU onlining (see kvm_starting_cpu()). Because SMP boot is serialized, the resulting flow is effectively: on_each_ap_cpu() { WRMSR(MSR_TEST_CTRL, 0) VMXON } As a result, the WRMSR can disable VMX on a different CPU that has already done VMXON. This ultimately results in a #UD on VMPTRLD when KVM regains control and attempt run its vCPUs. 
The above voodoo was confirmed by reworking KVM's VMXON flow to write MSR_TEST_CTRL prior to VMXON, and to serialize the sequence as above. Further verification of the insanity was done by redoing VMXON on all APs after the initial WRMSR->VMXON sequence. The additional VMXON, which should VM-Fail, occasionally succeeded, and also eliminated the unexpected #UD on VMPTRLD. The damage done by writing MSR_TEST_CTRL doesn't appear to be limited to VMX, e.g. after suspend with an active KVM guest, subsequent reboots almost always hang (even when fudging VMXON), a #UD on a random Jcc was observed, suspend/resume stability is qualitatively poor, and so on and so forth. kernel BUG at arch/x86/kvm/x86.c:386! CPU: 1 PID: 2592 Comm: CPU 6/KVM Tainted: G D Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:kvm_spurious_fault+0xf/0x20 Call Trace: vmx_vcpu_load_vmcs+0x1fb/0x2b0 vmx_vcpu_load+0x3e/0x160 kvm_arch_vcpu_load+0x48/0x260 finish_task_switch+0x140/0x260 __schedule+0x460/0x720 _cond_resched+0x2d/0x40 kvm_arch_vcpu_ioctl_run+0x82e/0x1ca0 kvm_vcpu_ioctl+0x363/0x5c0 ksys_ioctl+0x88/0xa0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x4c/0x170 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: dbaba47085b0c ("x86/split_lock: Rework the initialization flow of split lock detection") Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200605192605.7439-1-sean.j.christopherson@intel.com --- arch/x86/kernel/cpu/intel.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c25a67a34bd3..0ab48f1cdf84 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -49,6 +49,13 @@ enum split_lock_detect_state { static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -1071,7 +1078,8 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { - split_lock_verify_msr(sld_state != sld_off); + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); } static void split_lock_warn(unsigned long ip) @@ -1177,5 +1185,6 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) return; } + cpu_model_supports_sld = true; split_lock_setup(); } -- cgit v1.2.3 From c9c26150e61de441ab58b25c1f64afc049ee0fee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:11 -0700 Subject: x86/entry: Assert that syscalls are on the right stack Now that the entry stack is a full page, it's too easy to regress the system call entry code and end up on the wrong stack without noticing. Assert that all system calls (SYSCALL64, SYSCALL32, SYSENTER, and INT80) are on the right stack and have pt_regs in the right place. 
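Concretely, "the right place" is the pt_regs area at the top of the task stack that task_pt_regs() computes. A simplified sketch of that relationship (not the literal definition in asm/processor.h):

    /* simplified: pt_regs sits just below the top of the task stack */
    #define task_pt_regs(task) \
            ((struct pt_regs *)task_top_of_stack(task) - 1)

The new assertions therefore boil down to checking that the frame handed over by the entry asm is exactly that location, and that we are running on the thread stack rather than on the entry stack.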
Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/52059e42bb0ab8551153d012d68f7be18d72ff8e.1593191971.git.luto@kernel.org --- arch/x86/entry/common.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bd3f14175193..ed8ccc820995 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,15 @@ #define CREATE_TRACE_POINTS #include +/* Check that the stack and regs on entry from user mode are sane. */ +static void check_user_regs(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + WARN_ON_ONCE(!on_thread_stack()); + WARN_ON_ONCE(regs != task_pt_regs(current)); + } +} + #ifdef CONFIG_CONTEXT_TRACKING /** * enter_from_user_mode - Establish state when coming from user mode @@ -127,9 +136,6 @@ static long syscall_trace_enter(struct pt_regs *regs) unsigned long ret = 0; u32 work; - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - BUG_ON(regs != task_pt_regs(current)); - work = READ_ONCE(ti->flags); if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { @@ -346,6 +352,8 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -409,6 +417,8 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -460,6 +470,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) vdso_image_32.sym_int80_landing_pad; bool success; + check_user_regs(regs); + /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. -- cgit v1.2.3 From d1721250f3ffed9afba3e1fb729947cec64c5a8a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:12 -0700 Subject: x86/entry: Move SYSENTER's regs->sp and regs->flags fixups into C The SYSENTER asm (32-bit and compat) contains fixups for regs->sp and regs->flags. Move the fixups into C and fix some comments while at it. This is a valid cleanup all by itself, and it also simplifies the subsequent patch that will fix Xen PV SYSENTER. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/fe62bef67eda7fac75b8f3dbafccf571dc4ece6b.1593191971.git.luto@kernel.org --- arch/x86/entry/common.c | 12 ++++++++++++ arch/x86/entry/entry_32.S | 5 ++--- arch/x86/entry/entry_64_compat.S | 11 +++++------ 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ed8ccc820995..f392a8bcd1c3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -522,6 +522,18 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif } + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +{ + /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ + regs->sp = regs->bp; + + /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. 
*/ + regs->flags |= X86_EFLAGS_IF; + + return do_fast_syscall_32(regs); +} #endif SYSCALL_DEFINE0(ni_syscall) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 024d7d276cd4..2d0bd5d5f032 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -933,9 +933,8 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ - pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushl $0 /* pt_regs->sp (placeholder) */ pushfl /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ @@ -965,7 +964,7 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_flags_fixed: movl %esp, %eax - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 0f974ae01e62..7b9d8150f652 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -68,16 +68,15 @@ SYM_CODE_START(entry_SYSENTER_compat) /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp (stashed in bp) */ + pushq $0 /* pt_regs->sp = 0 (placeholder) */ /* * Push flags. This is nasty. First, interrupts are currently - * off, but we need pt_regs->flags to have IF set. Second, even - * if TF was set when SYSENTER started, it's clear by now. We fix - * that later using TIF_SINGLESTEP. + * off, but we need pt_regs->flags to have IF set. Second, if TS + * was set in usermode, it's still set, and we're singlestepping + * through this code. do_SYSENTER_32() will fix up IF. */ pushfq /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ @@ -135,7 +134,7 @@ SYM_CODE_START(entry_SYSENTER_compat) .Lsysenter_flags_fixed: movq %rsp, %rdi - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV -- cgit v1.2.3 From ffae641f57476369b4d503402b37ebe489d23395 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 26 Jun 2020 10:21:13 -0700 Subject: x86/entry/64/compat: Fix Xen PV SYSENTER frame setup The SYSENTER frame setup was nonsense. It worked by accident because the normal code into which the Xen asm jumped (entry_SYSENTER_32/compat) threw away SP without touching the stack. entry_SYSENTER_compat was recently modified such that it relied on having a valid stack pointer, so now the Xen asm needs to invoke it with a valid stack. Fix it up like SYSCALL: use the Xen-provided frame and skip the bare metal prologue. 
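For orientation, the frame Xen leaves on the PV guest stack for a compat SYSENTER can be pictured roughly as the struct below. This is inferred from the offsets used in the hunk that follows, so treat the individual slot meanings as an assumption rather than as ABI documentation.

    /* sketch of the Xen-provided frame, lowest address (top of stack) first */
    struct xen_sysenter_frame {
            unsigned long rcx;      /* discarded by the two pops */
            unsigned long r11;      /* discarded by the two pops */
            unsigned long rip;      /* lands in pt_regs->ip */
            unsigned long cs;       /* overwritten with __USER32_CS */
            unsigned long rflags;   /* lands in pt_regs->flags */
            unsigned long rsp;      /* lands in pt_regs->sp */
            unsigned long ss;       /* overwritten with __USER32_DS */
    };

Once the first two slots are popped, the remaining five form the hardware-style frame that entry_SYSENTER_compat_after_hwframe expects.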
Fixes: 1c3e5d3f60e2 ("x86/entry: Make entry_64_compat.S objtool clean") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Boris Ostrovsky Link: https://lkml.kernel.org/r/947880c41ade688ff4836f665d0c9fcaa9bd1201.1593191971.git.luto@kernel.org --- arch/x86/entry/entry_64_compat.S | 1 + arch/x86/xen/xen-asm_64.S | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 7b9d8150f652..381a6de7de9c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -79,6 +79,7 @@ SYM_CODE_START(entry_SYSENTER_compat) pushfq /* pt_regs->flags (except IF = 0) */ pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ +SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 5d252aaeade8..e1e1c7eafa60 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -161,10 +161,22 @@ SYM_FUNC_END(xen_syscall32_target) /* 32-bit compat sysenter target */ SYM_FUNC_START(xen_sysenter_target) - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp - jmp entry_SYSENTER_compat + /* + * NB: Xen is polite and clears TF from EFLAGS for us. This means + * that we don't need to guard against single step exceptions here. + */ + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSENTER_compat_after_hwframe SYM_FUNC_END(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ -- cgit v1.2.3 From d74fcfc1f0ff4b6c26ecef1f9e48d8089ab4eaac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 19:17:14 -0700 Subject: KVM: x86: Inject #GP if guest attempts to toggle CR4.LA57 in 64-bit mode Inject a #GP on MOV CR4 if CR4.LA57 is toggled in 64-bit mode, which is illegal per Intel's SDM: CR4.LA57 57-bit linear addresses (bit 12 of CR4) ... blah blah blah ... This bit cannot be modified in IA-32e mode. Note, the pseudocode for MOV CR doesn't call out the fault condition, which is likely why the check was missed during initial development. This is arguably an SDM bug and will hopefully be fixed in future release of the SDM. 
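Spelled out, the check being added is tiny and mirrors the hunk below (helper and constant names are the ones KVM already uses):

    if (is_long_mode(vcpu)) {
            if (!(cr4 & X86_CR4_PAE))
                    return 1;
            /* toggling LA57 while in IA-32e mode must #GP */
            if ((cr4 ^ old_cr4) & X86_CR4_LA57)
                    return 1;
    }

Note that XOR-ing against old_cr4 only rejects a change of the bit, so a guest that already runs with LA57 set (or clear) can keep writing the same CR4 value.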
Fixes: fd8cb433734ee ("KVM: MMU: Expose the LA57 feature to VM.") Cc: stable@vger.kernel.org Reported-by: Sebastien Boeuf Signed-off-by: Sean Christopherson Message-Id: <20200703021714.5549-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a026d926072c..88c593f83b28 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -975,6 +975,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, -- cgit v1.2.3 From 7c83d096aed055a7763a03384f92115363448b71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 21:04:21 -0700 Subject: KVM: x86: Mark CR4.TSD as being possibly owned by the guest Mark CR4.TSD as being possibly owned by the guest as that is indeed the case on VMX. Without TSD being tagged as possibly owned by the guest, a targeted read of CR4 to get TSD could observe a stale value. This bug is benign in the current code base as the sole consumer of TSD is the emulator (for RDTSC) and the emulator always "reads" the entirety of CR4 when grabbing bits. Add a build-time assertion in to ensure VMX doesn't hand over more CR4 bits without also updating x86. Fixes: 52ce3c21aec3 ("x86,kvm,vmx: Don't trap writes to CR4.TSD") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20200703040422.31536-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 2 +- arch/x86/kvm/vmx/vmx.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index ff2d0e9ca3bc..cfe83d4ae625 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb22f33bf1d8..5c9bfc0b9ab9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4034,6 +4034,8 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { + BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; if (enable_ept) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; -- cgit v1.2.3 From fa71e9527f6a0153ae6a880031b902818af1bdaf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 2 Jul 2020 21:04:22 -0700 Subject: KVM: VMX: Use KVM_POSSIBLE_CR*_GUEST_BITS to initialize guest/host masks Use the "common" KVM_POSSIBLE_CR*_GUEST_BITS defines to initialize the CR0/CR4 guest host masks instead of duplicating most of the CR4 mask and open coding the CR0 mask. SVM doesn't utilize the masks, i.e. the masks are effectively VMX specific even if they're not named as such. 
This avoids duplicate code, better documents the guest owned CR0 bit, and eliminates the need for a build-time assertion to keep VMX and x86 synchronized. Signed-off-by: Sean Christopherson Message-Id: <20200703040422.31536-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 15 +++++---------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d1af20b050a8..b26655104d4a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4109,7 +4109,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it); */ - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; vmx_set_cr0(vcpu, vmcs12->host_cr0); /* Same as above - no reason to call set_cr4_guest_host_mask(). */ @@ -4259,7 +4259,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c9bfc0b9ab9..13745f2a5ecd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -133,9 +133,6 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -4034,11 +4031,9 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS); - - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS; + if (!enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) vmx->vcpu.arch.cr4_guest_owned_bits &= ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; @@ -4335,8 +4330,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) /* 22.2.1, 20.8.1 */ vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); - vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; - vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); + vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); set_cr4_guest_host_mask(vmx); -- cgit v1.2.3 From a3a66c3822e03692ed7c5888e8f2d384cc698d34 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jul 2020 15:15:27 -0700 Subject: vmalloc: fix the owner argument for the new __vmalloc_node_range callers Fix the recently added new __vmalloc_node_range callers to pass the correct values as the owner for display in /proc/vmallocinfo. 
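For context, /proc/vmallocinfo symbolizes the owner with %pS, so it needs a return address rather than the address of a string literal, which is what __func__ provides. A rough sketch (the sample output line is made up for illustration):

    /* with a proper owner, /proc/vmallocinfo can show something like:
     *   0xffffc90000002000-0xffffc90000005000   12288 alloc_insn_page+0x1c/0x40 pages=2
     * hence the callers below now pass __builtin_return_address(0), e.g.:
     */
    __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL,
                         PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
                         NUMA_NO_NODE, __builtin_return_address(0));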
Fixes: 800e26b81311 ("x86/hyperv: allocate the hypercall page with only read and execute bits") Fixes: 10d5e97c1bf8 ("arm64: use PAGE_KERNEL_ROX directly in alloc_insn_page") Fixes: 7a0e27b2a0ce ("mm: remove vmalloc_exec") Reported-by: Ard Biesheuvel Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200627075649.2455097-1-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/kernel/probes/kprobes.c | 2 +- arch/x86/hyperv/hv_init.c | 3 ++- kernel/module.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index cbe49cd117cf..5290f17a4d80 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -122,7 +122,7 @@ void *alloc_insn_page(void) { return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __func__); + NUMA_NO_NODE, __builtin_return_address(0)); } /* arm kprobe: install breakpoint in text */ diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 2bdc72e6890e..6035df1b49e1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -377,7 +377,8 @@ void __init hyperv_init(void) hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, - VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, __func__); + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); goto remove_cpuhp_state; diff --git a/kernel/module.c b/kernel/module.c index 0c6573b98c36..bee1c25ca5c5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2785,7 +2785,7 @@ void * __weak module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __func__); + NUMA_NO_NODE, __builtin_return_address(0)); } bool __weak module_init_section(const char *name) -- cgit v1.2.3 From db5b2c5a90a111618f071d231a8b945cf522313e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:53 -0700 Subject: x86/entry/compat: Clear RAX high bits on Xen PV SYSENTER Move the clearing of the high bits of RAX after Xen PV joins the SYSENTER path so that Xen PV doesn't skip it. Arguably this code should be deleted instead, but that would belong in the merge window. Fixes: ffae641f5747 ("x86/entry/64/compat: Fix Xen PV SYSENTER frame setup") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/9d33b3f3216dcab008070f1c28b6091ae7199969.1593795633.git.luto@kernel.org --- arch/x86/entry/entry_64_compat.S | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 381a6de7de9c..541fdaf64045 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,15 +57,6 @@ SYM_CODE_START(entry_SYSENTER_compat) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) 
- */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq $0 /* pt_regs->sp = 0 (placeholder) */ @@ -80,6 +71,16 @@ SYM_CODE_START(entry_SYSENTER_compat) pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ -- cgit v1.2.3 From 3c73b81a9164d0c1b6379d6672d2772a9e95168e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:54 -0700 Subject: x86/entry, selftests: Further improve user entry sanity checks Chasing down a Xen bug caused me to realize that the new entry sanity checks are still fairly weak. Add some more checks. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/881de09e786ab93ce56ee4a2437ba2c308afe7a9.1593795633.git.luto@kernel.org --- arch/x86/entry/common.c | 19 +++++++++++++++++++ tools/testing/selftests/x86/syscall_nt.c | 11 +++++++++++ 2 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f392a8bcd1c3..e83b3f14897c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,6 +49,23 @@ static void check_user_regs(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + /* + * Make sure that the entry code gave us a sensible EFLAGS + * register. Native because we want to check the actual CPU + * state, not the interrupt state as imagined by Xen. + */ + unsigned long flags = native_save_fl(); + WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | + X86_EFLAGS_NT)); + + /* We think we came from user mode. Make sure pt_regs agrees. */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * All entries from user mode (except #DF) should be on the + * normal thread stack and should have user pt_regs in the + * correct location. + */ WARN_ON_ONCE(!on_thread_stack()); WARN_ON_ONCE(regs != task_pt_regs(current)); } @@ -577,6 +594,7 @@ SYSCALL_DEFINE0(ni_syscall) bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) { if (user_mode(regs)) { + check_user_regs(regs); enter_from_user_mode(); return false; } @@ -710,6 +728,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) */ void noinstr idtentry_enter_user(struct pt_regs *regs) { + check_user_regs(regs); enter_from_user_mode(); } diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index 970e5e14d96d..a108b80dd082 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -81,5 +81,16 @@ int main(void) printf("[RUN]\tSet NT|AC|TF and issue a syscall\n"); do_it(X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_TF); + /* + * Now try DF. This is evil and it's plausible that we will crash + * glibc, but glibc would have to do something rather surprising + * for this to happen. + */ + printf("[RUN]\tSet DF and issue a syscall\n"); + do_it(X86_EFLAGS_DF); + + printf("[RUN]\tSet TF|DF and issue a syscall\n"); + do_it(X86_EFLAGS_TF | X86_EFLAGS_DF); + return nerrs == 0 ? 
0 : 1; } -- cgit v1.2.3 From f41f0824224eb12ad84de8972962dd54be5abe3b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:55 -0700 Subject: x86/entry/xen: Route #DB correctly on Xen PV On Xen PV, #DB doesn't use IST. It still needs to be correctly routed depending on whether it came from user or kernel mode. Get rid of DECLARE/DEFINE_IDTENTRY_XEN -- it was too hard to follow the logic. Instead, route #DB and NMI through DECLARE/DEFINE_IDTENTRY_RAW on Xen, and do the right thing for #DB. Also add more warnings to the exc_debug* handlers to make this type of failure more obvious. This fixes various forms of corruption that happen when usermode triggers #DB on Xen PV. Fixes: 4c0dcd8350a0 ("x86/entry: Implement user mode C entry points for #DB and #MCE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/4163e733cce0b41658e252c6c6b3464f33fdff17.1593795633.git.luto@kernel.org --- arch/x86/include/asm/idtentry.h | 24 ++++++------------------ arch/x86/kernel/traps.c | 12 ++++++++++++ arch/x86/xen/enlighten_pv.c | 28 ++++++++++++++++++++++++---- arch/x86/xen/xen-asm_64.S | 5 ++--- 4 files changed, 44 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cf51c50eb356..94333ac3092b 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -398,18 +398,6 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST -/** - * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points - * @vector: Vector number (ignored for C) - * @func: Function name of the entry point - * - * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM - * indirection magic. - */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - asmlinkage void xen_asm_exc_xen##func(void); \ - asmlinkage void asm_exc_xen##func(void) - #else /* !__ASSEMBLY__ */ /* @@ -469,10 +457,6 @@ __visible noinstr void func(struct pt_regs *regs, \ /* No ASM code emitted for NMI */ #define DECLARE_IDTENTRY_NMI(vector, func) -/* XEN NMI and DB wrapper */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - idtentry vector asm_exc_xen##func exc_##func has_error_code=0 - /* * ASM code to emit the common vector entry stubs where each stub is * packed into 8 bytes. @@ -570,11 +554,15 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); /* NMI */ DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); -DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); +#endif /* #DB */ DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); -DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); +#endif /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f9727b96961f..c17f9b57171f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -865,6 +865,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, instrumentation_begin(); trace_hardirqs_off_finish(); + /* + * If something gets miswired and we end up here for a user mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(user_mode(regs)); + /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. 
@@ -883,6 +889,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { + /* + * If something gets miswired and we end up here for a kernel mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(!user_mode(regs)); + idtentry_enter_user(regs); instrumentation_begin(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index acc49fa6a097..0d68948c82ad 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -598,6 +598,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, } #ifdef CONFIG_X86_64 +void noist_exc_debug(struct pt_regs *regs); + +DEFINE_IDTENTRY_RAW(xenpv_exc_nmi) +{ + /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */ + exc_nmi(regs); +} + +DEFINE_IDTENTRY_RAW(xenpv_exc_debug) +{ + /* + * There's no IST on Xen PV, but we still need to dispatch + * to the correct handler. + */ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -609,18 +629,18 @@ struct trap_array_entry { .xen = xen_asm_##func, \ .ist_okay = ist_ok } -#define TRAP_ENTRY_REDIR(func, xenfunc, ist_ok) { \ +#define TRAP_ENTRY_REDIR(func, ist_ok) { \ .orig = asm_##func, \ - .xen = xen_asm_##xenfunc, \ + .xen = xen_asm_xenpv_##func, \ .ist_okay = ist_ok } static struct trap_array_entry trap_array[] = { - TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ), + TRAP_ENTRY_REDIR(exc_debug, true ), TRAP_ENTRY(exc_double_fault, true ), #ifdef CONFIG_X86_MCE TRAP_ENTRY(exc_machine_check, true ), #endif - TRAP_ENTRY_REDIR(exc_nmi, exc_xennmi, true ), + TRAP_ENTRY_REDIR(exc_nmi, true ), TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index e1e1c7eafa60..aab1d99b2b48 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,10 +29,9 @@ _ASM_NOKPROBE(xen_\name) .endm xen_pv_trap asm_exc_divide_error -xen_pv_trap asm_exc_debug -xen_pv_trap asm_exc_xendebug +xen_pv_trap asm_xenpv_exc_debug xen_pv_trap asm_exc_int3 -xen_pv_trap asm_exc_xennmi +xen_pv_trap asm_xenpv_exc_nmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op -- cgit v1.2.3 From 13cbc0cd4a30c815984ad88e3a2e5976493516a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:56 -0700 Subject: x86/entry/32: Fix #MC and #DB wiring on x86_32 DEFINE_IDTENTRY_MCE and DEFINE_IDTENTRY_DEBUG were wired up as non-RAW on x86_32, but the code expected them to be RAW. Get rid of all the macro indirection for them on 32-bit and just use DECLARE_IDTENTRY_RAW and DEFINE_IDTENTRY_RAW directly. Also add a warning to make sure that we only hit the _kernel paths in kernel mode. 
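The added warnings are deliberately simple and mirror the hunks below, e.g. for the kernel-mode #MC path:

    static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
    {
            /* the _kernel flavor must never be reached with a user-mode frame */
            WARN_ON_ONCE(user_mode(regs));
            /* ... existing handling continues unchanged ... */
    }

They matter because the user and kernel flavors do different entry/exit bookkeeping, so silently taking the wrong path would corrupt state; the warning makes any future miswiring loud instead.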
Reported-by: Naresh Kamboju Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/9e90a7ee8e72fd757db6d92e1e5ff16339c1ecf9.1593795633.git.luto@kernel.org --- arch/x86/include/asm/idtentry.h | 23 +++++++++++++---------- arch/x86/kernel/cpu/mce/core.c | 4 +++- arch/x86/kernel/traps.c | 2 +- 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 94333ac3092b..eeac6dc2adaa 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -353,10 +353,6 @@ static __always_inline void __##func(struct pt_regs *regs) #else /* CONFIG_X86_64 */ -/* Maps to a regular IDTENTRY on 32bit for now */ -# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY -# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY - /** * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant * @vector: Vector number (ignored for C) @@ -387,16 +383,18 @@ __visible noinstr void func(struct pt_regs *regs, \ #endif /* !CONFIG_X86_64 */ /* C-Code mapping */ +#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW +#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW + +#ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST -#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW -#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW - #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#endif #else /* !__ASSEMBLY__ */ @@ -443,9 +441,6 @@ __visible noinstr void func(struct pt_regs *regs, \ # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) -# define DECLARE_IDTENTRY_DEBUG(vector, func) \ - DECLARE_IDTENTRY(vector, func) - /* No ASM emitted for DF as this goes through a C shim */ # define DECLARE_IDTENTRY_DF(vector, func) @@ -549,7 +544,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); #ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); +#endif #endif /* NMI */ @@ -559,7 +558,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); #endif /* #DB */ +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug); +#endif #ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); #endif diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ce9120c4f740..a6a90b5d7c83 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1901,6 +1901,8 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + WARN_ON_ONCE(user_mode(regs)); + /* * Only required when from kernel mode. See * mce_check_crashing_cpu() for details. 
@@ -1954,7 +1956,7 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) } #else /* 32bit unified entry point */ -DEFINE_IDTENTRY_MCE(exc_machine_check) +DEFINE_IDTENTRY_RAW(exc_machine_check) { unsigned long dr7; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c17f9b57171f..6ed8cc5fbe8f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -925,7 +925,7 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) } #else /* 32 bit does not have separate entry points. */ -DEFINE_IDTENTRY_DEBUG(exc_debug) +DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6, dr7; -- cgit v1.2.3 From cc801833a171163edb6385425349ba8903bd1b20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2020 10:02:57 -0700 Subject: x86/ldt: Disable 16-bit segments on Xen PV Xen PV doesn't implement ESPFIX64, so they don't work right. Disable them. Also print a warning the first time anyone tries to use a 16-bit segment on a Xen PV guest that would otherwise allow it to help people diagnose this change in behavior. This gets us closer to having all x86 selftests pass on Xen PV. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/92b2975459dfe5929ecf34c3896ad920bd9e3f2d.1593795633.git.luto@kernel.org --- arch/x86/kernel/ldt.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 8748321c4486..34e918ad34d4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -29,6 +29,8 @@ #include #include +#include + /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) @@ -543,6 +545,37 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) return bytecount; } +static bool allow_16bit_segments(void) +{ + if (!IS_ENABLED(CONFIG_X86_16BIT)) + return false; + +#ifdef CONFIG_XEN_PV + /* + * Xen PV does not implement ESPFIX64, which means that 16-bit + * segments will not work correctly. Until either Xen PV implements + * ESPFIX64 and can signal this fact to the guest or unless someone + * provides compelling evidence that allowing broken 16-bit segments + * is worthwhile, disallow 16-bit segments under Xen PV. + */ + if (xen_pv_domain()) { + static DEFINE_MUTEX(xen_warning); + static bool warned; + + mutex_lock(&xen_warning); + if (!warned) { + pr_info("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); + warned = true; + } + mutex_unlock(&xen_warning); + + return false; + } +#endif + + return true; +} + static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; @@ -574,7 +607,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) /* The user wants to clear the entry. 
*/ memset(&ldt, 0, sizeof(ldt)); } else { - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; } -- cgit v1.2.3 From a4c0e91d1d65bc58f928b80ed824e10e165da22c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Jul 2020 21:33:11 +0200 Subject: x86/entry/32: Fix XEN_PV build dependency xenpv_exc_nmi() and xenpv_exc_debug() are only defined on 64-bit kernels, but they snuck into the 32-bit build via , causing the link to fail: ld: arch/x86/entry/entry_32.o: in function `asm_xenpv_exc_nmi': (.entry.text+0x817): undefined reference to `xenpv_exc_nmi' ld: arch/x86/entry/entry_32.o: in function `asm_xenpv_exc_debug': (.entry.text+0x827): undefined reference to `xenpv_exc_debug' Only use them on 64-bit kernels. Fixes: f41f0824224e: ("x86/entry/xen: Route #DB correctly on Xen PV") Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Peter Zijlstra (Intel) Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/idtentry.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index eeac6dc2adaa..f3d70830bf2a 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -553,7 +553,7 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); /* NMI */ DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); -#ifdef CONFIG_XEN_PV +#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64) DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); #endif @@ -563,7 +563,7 @@ DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); #else DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug); #endif -#ifdef CONFIG_XEN_PV +#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64) DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); #endif -- cgit v1.2.3 From bb5a93aaf25261321db0c499cde7da6ee9d8b164 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Jul 2020 12:50:20 -0700 Subject: x86/ldt: use "pr_info_once()" instead of open-coding it badly Using a mutex for "print this warning only once" is so overdesigned as to be actively offensive to my sensitive stomach. Just use "pr_info_once()" that already does this, although in a (harmlessly) racy manner that can in theory cause the message to be printed twice if more than one CPU races on that "is this the first time" test. [ If somebody really cares about that harmless data race (which sounds very unlikely indeed), that person can trivially fix printk_once() by using a simple atomic access, preferably with an optimistic non-atomic test first before even bothering to treat the pointless "make sure it is _really_ just once" case. A mutex is most definitely never the right primitive to use for something like this. ] Yes, this is a small and meaningless detail in a code path that hardly matters. But let's keep some code quality standards here, and not accept outrageously bad code. Link: https://lore.kernel.org/lkml/CAHk-=wgV9toS7GU3KmNpj8hCS9SeF+A0voHS8F275_mgLhL4Lw@mail.gmail.com/ Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- arch/x86/kernel/ldt.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 34e918ad34d4..b8aee71840ae 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -559,16 +559,7 @@ static bool allow_16bit_segments(void) * is worthwhile, disallow 16-bit segments under Xen PV. 
*/ if (xen_pv_domain()) { - static DEFINE_MUTEX(xen_warning); - static bool warned; - - mutex_lock(&xen_warning); - if (!warned) { - pr_info("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); - warned = true; - } - mutex_unlock(&xen_warning); - + pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); return false; } #endif -- cgit v1.2.3 From bce9b042ec73e8662b8119d4ca47e7c78b20d0bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:28:05 +0200 Subject: x86/traps: Disable interrupts in exc_aligment_check() exc_alignment_check() fails to disable interrupts before returning to the entry code. Fixes: ca4c6a9858c2 ("x86/traps: Make interrupt enable/disable symmetric in C code") Reported-by: syzbot+0889df9502bc0f112b31@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200708192934.076519438@linutronix.de --- arch/x86/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6ed8cc5fbe8f..4f3a509e5547 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -299,6 +299,8 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); + + local_irq_disable(); } #ifdef CONFIG_VMAP_STACK -- cgit v1.2.3 From 006e1ced516d2bfd9db63a32b5dba3c2abf43b04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:28:06 +0200 Subject: x86/entry: Mark check_user_regs() noinstr It's called from the non-instrumentable section. Fixes: c9c26150e61d ("x86/entry: Assert that syscalls are on the right stack") Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200708192934.191497962@linutronix.de --- arch/x86/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index e83b3f14897c..ea7b515e3bc2 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -46,7 +46,7 @@ #include /* Check that the stack and regs on entry from user mode are sane. */ -static void check_user_regs(struct pt_regs *regs) +static noinstr void check_user_regs(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { /* -- cgit v1.2.3 From bd87e6f6610aa96fde01ee6653e162213f7ec836 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Jul 2020 21:28:07 +0200 Subject: x86/entry/common: Make prepare_exit_to_usermode() static No users outside this file anymore. 
Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200708192934.301116609@linutronix.de --- arch/x86/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ea7b515e3bc2..f09288431f28 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -294,7 +294,7 @@ static void __prepare_exit_to_usermode(struct pt_regs *regs) #endif } -__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs) +static noinstr void prepare_exit_to_usermode(struct pt_regs *regs) { instrumentation_begin(); __prepare_exit_to_usermode(regs); -- cgit v1.2.3 From 83d31e5271ac74aad14b5a1a2ed26923e1446329 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Jul 2020 13:12:09 -0400 Subject: KVM: nVMX: fixes for preemption timer migration Commit 850448f35aaf ("KVM: nVMX: Fix VMX preemption timer migration", 2020-06-01) accidentally broke nVMX live migration from older version by changing the userspace ABI. Restore it and, while at it, ensure that vmx->nested.has_preemption_timer_deadline is always initialized according to the KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE flag. Cc: Makarand Sonare Fixes: 850448f35aaf ("KVM: nVMX: Fix VMX preemption timer migration") Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 5 +++-- arch/x86/include/uapi/asm/kvm.h | 5 +++-- arch/x86/kvm/vmx/nested.c | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 426f94582b7a..320788f81a05 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4339,14 +4339,15 @@ Errors: #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 struct kvm_vmx_nested_state_hdr { - __u32 flags; __u64 vmxon_pa; __u64 vmcs12_pa; - __u64 preemption_timer_deadline; struct { __u16 flags; } smm; + + __u32 flags; + __u64 preemption_timer_deadline; }; struct kvm_vmx_nested_state_data { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 17c5a038f42d..0780f97c1850 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -408,14 +408,15 @@ struct kvm_vmx_nested_state_data { }; struct kvm_vmx_nested_state_hdr { - __u32 flags; __u64 vmxon_pa; __u64 vmcs12_pa; - __u64 preemption_timer_deadline; struct { __u16 flags; } smm; + + __u32 flags; + __u64 preemption_timer_deadline; }; struct kvm_svm_nested_state_data { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b26655104d4a..d4a4cec034d0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6176,6 +6176,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; } + vmx->nested.has_preemption_timer_deadline = false; if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { vmx->nested.has_preemption_timer_deadline = true; vmx->nested.preemption_timer_deadline = -- cgit v1.2.3 From e3beca48a45b5e0e6e6a4e0124276b8248dcc9bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 9 Jul 2020 11:53:06 +0200 Subject: irqdomain/treewide: Keep firmware node unconditionally allocated Quite some non OF/ACPI users of irqdomains allocate firmware nodes of type IRQCHIP_FWNODE_NAMED or IRQCHIP_FWNODE_NAMED_ID and free them right after creating the irqdomain. The only purpose of these FW nodes is to convey name information. 
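At the affected call sites the pattern is, schematically (irq_domain_alloc_named_fwnode() and irq_domain_create_tree() are real irqdomain interfaces; "SOMEIRQCHIP" and the ops structure are placeholders, and error handling is trimmed):

    fn = irq_domain_alloc_named_fwnode("SOMEIRQCHIP");
    domain = irq_domain_create_tree(fn, &someirqchip_domain_ops, NULL);
    irq_domain_free_fwnode(fn);     /* freed right away, only the name was wanted */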
When this was introduced the core code did not store the pointer to the node in the irqdomain. A recent change stored the firmware node pointer in irqdomain for other reasons and missed to notice that the usage sites which do the alloc_fwnode/create_domain/free_fwnode sequence are broken by this. Storing a dangling pointer is dangerous itself, but in case that the domain is destroyed later on this leads to a double free. Remove the freeing of the firmware node after creating the irqdomain from all affected call sites to cure this. Fixes: 711419e504eb ("irqdomain: Add the missing assignment of domain->fwnode for named fwnode") Reported-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/873661qakd.fsf@nanos.tec.linutronix.de --- arch/mips/pci/pci-xtalk-bridge.c | 5 +++-- arch/x86/kernel/apic/io_apic.c | 10 +++++----- arch/x86/kernel/apic/msi.c | 18 ++++++++++++------ arch/x86/kernel/apic/vector.c | 1 - arch/x86/platform/uv/uv_irq.c | 3 ++- drivers/iommu/amd/iommu.c | 5 +++-- drivers/iommu/hyperv-iommu.c | 5 ++++- drivers/iommu/intel/irq_remapping.c | 2 +- drivers/mfd/ioc3.c | 5 +++-- drivers/pci/controller/vmd.c | 5 +++-- 10 files changed, 36 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c index 3b2552fb7735..5958217861b8 100644 --- a/arch/mips/pci/pci-xtalk-bridge.c +++ b/arch/mips/pci/pci-xtalk-bridge.c @@ -627,9 +627,10 @@ static int bridge_probe(struct platform_device *pdev) return -ENOMEM; domain = irq_domain_create_hierarchy(parent, 0, 8, fn, &bridge_domain_ops, NULL); - irq_domain_free_fwnode(fn); - if (!domain) + if (!domain) { + irq_domain_free_fwnode(fn); return -ENOMEM; + } pci_set_flags(PCI_PROBE_ONLY); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce61e3e7d399..81ffcfbfaef2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2316,12 +2316,12 @@ static int mp_irqdomain_create(int ioapic) ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); - /* Release fw handle if it was allocated above */ - if (!cfg->dev) - irq_domain_free_fwnode(fn); - - if (!ip->irqdomain) + if (!ip->irqdomain) { + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); return -ENOMEM; + } ip->irqdomain->parent = parent; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5cbaca58af95..c2b2911feeef 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -263,12 +263,13 @@ void __init arch_init_msi_domain(struct irq_domain *parent) msi_default_domain = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, parent); - irq_domain_free_fwnode(fn); } - if (!msi_default_domain) + if (!msi_default_domain) { + irq_domain_free_fwnode(fn); pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); - else + } else { msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + } } #ifdef CONFIG_IRQ_REMAP @@ -301,7 +302,8 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, if (!fn) return NULL; d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) + irq_domain_free_fwnode(fn); return d; } #endif @@ -364,7 +366,8 @@ static struct irq_domain *dmar_get_irq_domain(void) if (fn) { dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, x86_vector_domain); - 
irq_domain_free_fwnode(fn); + if (!dmar_domain) + irq_domain_free_fwnode(fn); } out: mutex_unlock(&dmar_lock); @@ -489,7 +492,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) } d = msi_create_irq_domain(fn, domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } return d; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c48be6e1f676..cc8b16f89dd4 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -709,7 +709,6 @@ int __init arch_early_irq_init(void) x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); - irq_domain_free_fwnode(fn); irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index fc13cbbb2dce..abb6075397f0 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -167,9 +167,10 @@ static struct irq_domain *uv_get_irq_domain(void) goto out; uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL); - irq_domain_free_fwnode(fn); if (uv_domain) uv_domain->parent = x86_vector_domain; + else + irq_domain_free_fwnode(fn); out: mutex_unlock(&uv_lock); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 74cca1757172..2f22326ee4df 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3985,9 +3985,10 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) if (!fn) return -ENOMEM; iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu); - irq_domain_free_fwnode(fn); - if (!iommu->ir_domain) + if (!iommu->ir_domain) { + irq_domain_free_fwnode(fn); return -ENOMEM; + } iommu->ir_domain->parent = arch_get_ir_parent_domain(); iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain, diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 3c0c67a99c7b..8919c1c70b68 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -155,7 +155,10 @@ static int __init hyperv_prepare_irq_remapping(void) 0, IOAPIC_REMAPPING_ENTRY, fn, &hyperv_ir_domain_ops, NULL); - irq_domain_free_fwnode(fn); + if (!ioapic_ir_domain) { + irq_domain_free_fwnode(fn); + return -ENOMEM; + } /* * Hyper-V doesn't provide irq remapping function for diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 7f8769800815..9564d23d094f 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -563,8 +563,8 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) 0, INTR_REMAP_TABLE_ENTRIES, fn, &intel_ir_domain_ops, iommu); - irq_domain_free_fwnode(fn); if (!iommu->ir_domain) { + irq_domain_free_fwnode(fn); pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); goto out_free_bitmap; } diff --git a/drivers/mfd/ioc3.c b/drivers/mfd/ioc3.c index 02998d4eb74b..74cee7cb0afc 100644 --- a/drivers/mfd/ioc3.c +++ b/drivers/mfd/ioc3.c @@ -142,10 +142,11 @@ static int ioc3_irq_domain_setup(struct ioc3_priv_data *ipd, int irq) goto err; domain = irq_domain_create_linear(fn, 24, &ioc3_irq_domain_ops, ipd); - if (!domain) + if (!domain) { + irq_domain_free_fwnode(fn); goto err; + } - irq_domain_free_fwnode(fn); ipd->domain = domain; irq_set_chained_handler_and_data(irq, ioc3_irq_handler, domain); diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index e386d4eac407..9a64cf90c291 100644 --- 
a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -546,9 +546,10 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) vmd->irq_domain = pci_msi_create_irq_domain(fn, &vmd_msi_domain_info, x86_vector_domain); - irq_domain_free_fwnode(fn); - if (!vmd->irq_domain) + if (!vmd->irq_domain) { + irq_domain_free_fwnode(fn); return -ENODEV; + } pci_add_resource(&resources, &vmd->resources[0]); pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); -- cgit v1.2.3 From 6ee93f8df09c470da1a4af11e394c52d7b62418c Mon Sep 17 00:00:00 2001 From: Jian Cai Date: Tue, 14 Jul 2020 16:30:21 -0700 Subject: x86/entry: Add compatibility with IAS Clang's integrated assembler does not allow symbols with non-absolute values to be reassigned. Modify the interrupt entry loop macro to be compatible with IAS by using a label and an offset. Reported-by: Nick Desaulniers Reported-by: Sedat Dilek Suggested-by: Nick Desaulniers Suggested-by: Brian Gerst Suggested-by: Arvind Sankar Signed-off-by: Jian Cai Signed-off-by: Thomas Gleixner Tested-by: Sedat Dilek # Link: https://github.com/ClangBuiltLinux/linux/issues/1043 Link: https://lkml.kernel.org/r/20200714233024.1789985-1-caij2003@gmail.com --- arch/x86/include/asm/idtentry.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index f3d70830bf2a..5efaaed34eda 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -469,16 +469,15 @@ __visible noinstr void func(struct pt_regs *regs, \ .align 8 SYM_CODE_START(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR - pos = . .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) UNWIND_HINT_IRET_REGS +0 : .byte 0x6a, vector jmp asm_common_interrupt nop /* Ensure that the above is 8 bytes max */ - . = pos + 8 - pos=pos+8 - vector=vector+1 + . = 0b + 8 + vector = vector+1 .endr SYM_CODE_END(irq_entries_start) @@ -486,16 +485,15 @@ SYM_CODE_END(irq_entries_start) .align 8 SYM_CODE_START(spurious_entries_start) vector=FIRST_SYSTEM_VECTOR - pos = . .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) UNWIND_HINT_IRET_REGS +0 : .byte 0x6a, vector jmp asm_spurious_interrupt nop /* Ensure that the above is 8 bytes max */ - . = pos + 8 - pos=pos+8 - vector=vector+1 + . 
= 0b + 8 + vector = vector+1 .endr SYM_CODE_END(spurious_entries_start) #endif -- cgit v1.2.3 From 5769fe26f389b0002ed48fd16d642a1d86edaf79 Mon Sep 17 00:00:00 2001 From: Sedat Dilek Date: Tue, 14 Jul 2020 21:47:40 +0200 Subject: x86/entry: Fix vectors to IDTENTRY_SYSVEC for CONFIG_HYPERV When assembling with Clang via `make LLVM_IAS=1` and CONFIG_HYPERV enabled, we observe the following error: :9:6: error: expected absolute expression .if HYPERVISOR_REENLIGHTENMENT_VECTOR == 3 ^ :1:1: note: while in macro instantiation idtentry HYPERVISOR_REENLIGHTENMENT_VECTOR asm_sysvec_hyperv_reenlightenment sysvec_hyperv_reenlightenment has_error_code=0 ^ ./arch/x86/include/asm/idtentry.h:627:1: note: while in macro instantiation idtentry_sysvec HYPERVISOR_REENLIGHTENMENT_VECTOR sysvec_hyperv_reenlightenment; ^ :9:6: error: expected absolute expression .if HYPERVISOR_STIMER0_VECTOR == 3 ^ :1:1: note: while in macro instantiation idtentry HYPERVISOR_STIMER0_VECTOR asm_sysvec_hyperv_stimer0 sysvec_hyperv_stimer0 has_error_code=0 ^ ./arch/x86/include/asm/idtentry.h:628:1: note: while in macro instantiation idtentry_sysvec HYPERVISOR_STIMER0_VECTOR sysvec_hyperv_stimer0; This is caused by typos in arch/x86/include/asm/idtentry.h: HYPERVISOR_REENLIGHTENMENT_VECTOR -> HYPERV_REENLIGHTENMENT_VECTOR HYPERVISOR_STIMER0_VECTOR -> HYPERV_STIMER0_VECTOR For more details see ClangBuiltLinux issue #1088. Fixes: a16be368dd3f ("x86/entry: Convert various hypervisor vectors to IDTENTRY_SYSVEC") Suggested-by: Nick Desaulniers Signed-off-by: Sedat Dilek Signed-off-by: Thomas Gleixner Reviewed-by: Nathan Chancellor Reviewed-by: Wei Liu Reviewed-by: Nick Desaulniers Link: https://github.com/ClangBuiltLinux/linux/issues/1088 Link: https://github.com/ClangBuiltLinux/linux/issues/1043 Link: https://lore.kernel.org/patchwork/patch/1272115/ Link: https://lkml.kernel.org/r/20200714194740.4548-1-sedat.dilek@gmail.com --- arch/x86/include/asm/idtentry.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 5efaaed34eda..80d3b30d3ee3 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -624,8 +624,8 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); -DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) -- cgit v1.2.3 From 81e96851ea32deb2c921c870eecabf335f598aeb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 27 May 2020 15:53:46 +0200 Subject: x86: math-emu: Fix up 'cmp' insn for clang ias The clang integrated assembler requires the 'cmp' instruction to have a length prefix here: arch/x86/math-emu/wm_sqrt.S:212:2: error: ambiguous instructions require an explicit suffix (could be 'cmpb', 'cmpw', or 'cmpl') cmp $0xffffffff,-24(%ebp) ^ Make this a 32-bit comparison, which it was clearly meant to be. 
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20200527135352.1198078-1-arnd@arndb.de --- arch/x86/math-emu/wm_sqrt.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index 3b2b58164ec1..40526dd85137 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -209,7 +209,7 @@ sqrt_stage_2_finish: #ifdef PARANOID /* It should be possible to get here only if the arg is ffff....ffff */ - cmp $0xffffffff,FPU_fsqrt_arg_1 + cmpl $0xffffffff,FPU_fsqrt_arg_1 jnz sqrt_stage_2_error #endif /* PARANOID */ -- cgit v1.2.3 From baedb87d1b53532f81b4bd0387f83b05d4f7eb9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2020 18:00:02 +0200 Subject: genirq/affinity: Handle affinity setting on inactive interrupts correctly Setting interrupt affinity on inactive interrupts is inconsistent when hierarchical irq domains are enabled. The core code should just store the affinity and not call into the irq chip driver for inactive interrupts because the chip drivers may not be in a state to handle such requests. X86 has a hacky workaround for that, but all other irq chips do not, which causes problems e.g. on GIC V3 ITS. Instead of adding more ugly hacks all over the place, solve the problem in the core code. If the affinity is set on an inactive interrupt then: - Store it in the irq descriptor's affinity mask - Update the effective affinity to reflect that so user space has a consistent view - Don't call into the irq chip driver This is the core equivalent of the X86 workaround and works correctly because the affinity setting is established in the irq chip when the interrupt is activated later on. Note that this is only effective when hierarchical irq domains are enabled by the architecture. Doing it unconditionally would break legacy irq chip implementations. For hierarchical irq domains this works correctly as none of the drivers can have a dependency on affinity setting in inactive state by design. Remove the X86 workaround as it is no longer required. 
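In simplified form, the new core behaviour amounts to the sketch below (hypothetical helper name; the real helper added by this patch, irq_set_affinity_deactivated(), which also sets IRQD_AFFINITY_SET and handles the CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK split, is in the kernel/irq/manage.c hunk further down):

	#include <linux/irq.h>
	#include <linux/irqdesc.h>

	/* Sketch only: store the affinity of a not-yet-activated interrupt
	 * without calling into the irq chip driver. */
	static bool example_store_affinity_if_inactive(struct irq_desc *desc,
						       struct irq_data *data,
						       const struct cpumask *mask)
	{
		/* Only hierarchical irq domains guarantee that activation
		 * establishes the affinity later on. */
		if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data))
			return false;

		/* Store the affinity and keep the effective mask consistent
		 * for user space, but skip the chip driver entirely. */
		cpumask_copy(desc->irq_common_data.affinity, mask);
		cpumask_copy(irq_data_get_effective_affinity_mask(data), mask);
		return true;
	}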
Fixes: 02edee152d6e ("x86/apic/vector: Ignore set_affinity call for inactive interrupts") Reported-by: Ali Saidi Signed-off-by: Thomas Gleixner Tested-by: Ali Saidi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200529015501.15771-1-alisaidi@amazon.com Link: https://lkml.kernel.org/r/877dv2rv25.fsf@nanos.tec.linutronix.de --- arch/x86/kernel/apic/vector.c | 22 +++++----------------- kernel/irq/manage.c | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index cc8b16f89dd4..7649da2478d8 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -446,12 +446,10 @@ static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, trace_vector_activate(irqd->irq, apicd->is_managed, apicd->can_reserve, reserve); - /* Nothing to do for fixed assigned vectors */ - if (!apicd->can_reserve && !apicd->is_managed) - return 0; - raw_spin_lock_irqsave(&vector_lock, flags); - if (reserve || irqd_is_managed_and_shutdown(irqd)) + if (!apicd->can_reserve && !apicd->is_managed) + assign_irq_vector_any_locked(irqd); + else if (reserve || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); else if (apicd->is_managed) ret = activate_managed(irqd); @@ -774,20 +772,10 @@ void lapic_offline(void) static int apic_set_affinity(struct irq_data *irqd, const struct cpumask *dest, bool force) { - struct apic_chip_data *apicd = apic_chip_data(irqd); int err; - /* - * Core code can call here for inactive interrupts. For inactive - * interrupts which use managed or reservation mode there is no - * point in going through the vector assignment right now as the - * activation will assign a vector which fits the destination - * cpumask. Let the core code store the destination mask and be - * done with it. - */ - if (!irqd_is_activated(irqd) && - (apicd->is_managed || apicd->can_reserve)) - return IRQ_SET_MASK_OK; + if (WARN_ON_ONCE(!irqd_is_activated(irqd))) + return -EIO; raw_spin_lock(&vector_lock); cpumask_and(vector_searchmask, dest, cpu_online_mask); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..2a9fec53e159 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,9 +195,9 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -205,9 +205,19 @@ static void irq_validate_effective_affinity(struct irq_data *data) return; pr_warn_once("irq_chip %s did not update eff. 
affinity mask of irq %u\n", chip->name, data->irq); -#endif } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) +{ + cpumask_copy(irq_data_get_effective_affinity_mask(data), mask); +} +#else +static inline void irq_validate_effective_affinity(struct irq_data *data) { } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) { } +#endif + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -304,6 +314,26 @@ static int irq_try_set_affinity(struct irq_data *data, return ret; } +static bool irq_set_affinity_deactivated(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + /* + * If the interrupt is not yet activated, just store the affinity + * mask and do not call the chip driver at all. On activation the + * driver has to make sure anyway that the interrupt is in a + * useable state so startup works. + */ + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + return false; + + cpumask_copy(desc->irq_common_data.affinity, mask); + irq_init_effective_affinity(data, mask); + irqd_set(data, IRQD_AFFINITY_SET); + return true; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -314,6 +344,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; + if (irq_set_affinity_deactivated(data, mask, force)) + return 0; + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { -- cgit v1.2.3 From 5f55dd54994a596ce3bdb9e2a73164907ca46c03 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Jun 2020 14:19:12 +0200 Subject: media: atomisp: move CCK endpoint address to generic header IOSF MBI header contains a lot of definitions, such as end point addresses of IPs. Move CCK address from AtomISP driver to generic header. While here, drop unused one. 
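As a small usage sketch (hypothetical wrapper function; CCK_FUSE_REG_0 remains the driver-local register offset from atomisp-regs.h), a caller now reads a CCK-unit register through the generic endpoint define:

	#include <asm/iosf_mbi.h>
	#include "atomisp-regs.h"	/* for CCK_FUSE_REG_0 */

	static int example_read_cck_fuse(u32 *val)
	{
		/* BT_MBI_UNIT_CCK replaces the driver-local CCK_PORT define */
		return iosf_mbi_read(BT_MBI_UNIT_CCK, MBI_REG_READ, CCK_FUSE_REG_0, val);
	}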
Signed-off-by: Andy Shevchenko Signed-off-by: Mauro Carvalho Chehab --- arch/x86/include/asm/iosf_mbi.h | 1 + drivers/staging/media/atomisp/pci/atomisp-regs.h | 3 --- drivers/staging/media/atomisp/pci/atomisp_v4l2.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index 5270ff39b9af..a1911fea8739 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -39,6 +39,7 @@ #define BT_MBI_UNIT_PMC 0x04 #define BT_MBI_UNIT_GFX 0x06 #define BT_MBI_UNIT_SMI 0x0C +#define BT_MBI_UNIT_CCK 0x14 #define BT_MBI_UNIT_USB 0x43 #define BT_MBI_UNIT_SATA 0xA3 #define BT_MBI_UNIT_PCIE 0xA6 diff --git a/drivers/staging/media/atomisp/pci/atomisp-regs.h b/drivers/staging/media/atomisp/pci/atomisp-regs.h index de34ee28e390..022997f47121 100644 --- a/drivers/staging/media/atomisp/pci/atomisp-regs.h +++ b/drivers/staging/media/atomisp/pci/atomisp-regs.h @@ -20,9 +20,6 @@ #define ATOMISP_REGS_H /* common register definitions */ -#define PUNIT_PORT 0x04 -#define CCK_PORT 0x14 - #define PCICMDSTS 0x01 #define INTR 0x0f #define MSI_CAPID 0x24 diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index 29ea66f175c8..3bd78d870264 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1687,7 +1687,7 @@ static int atomisp_pci_probe(struct pci_dev *dev, isp->dfs = &dfs_config_cht; isp->pdev->d3cold_delay = 0; - iosf_mbi_read(CCK_PORT, MBI_REG_READ, CCK_FUSE_REG_0, &val); + iosf_mbi_read(BT_MBI_UNIT_CCK, MBI_REG_READ, CCK_FUSE_REG_0, &val); switch (val & CCK_FUSE_HPLL_FREQ_MASK) { case 0x00: isp->hpll_freq = HPLL_FREQ_800MHZ; -- cgit v1.2.3 From cadfad870154e14f745ec845708bc17d166065f2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 17 Jul 2020 16:53:55 -0700 Subject: x86/ioperm: Fix io bitmap invalidation on Xen PV tss_invalidate_io_bitmap() wasn't wired up properly through the pvop machinery, so the TSS and Xen's io bitmap would get out of sync whenever disabling a valid io bitmap. Add a new pvop for tss_invalidate_io_bitmap() to fix it. This is XSA-329. Fixes: 22fe5b0439dd ("x86/ioperm: Move TSS bitmap update to exit to user work") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Reviewed-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/d53075590e1f91c19f8af705059d3ff99424c020.1595030016.git.luto@kernel.org --- arch/x86/include/asm/io_bitmap.h | 16 ++++++++++++++++ arch/x86/include/asm/paravirt.h | 5 +++++ arch/x86/include/asm/paravirt_types.h | 1 + arch/x86/kernel/paravirt.c | 3 ++- arch/x86/kernel/process.c | 18 ++---------------- arch/x86/xen/enlighten_pv.c | 12 ++++++++++++ 6 files changed, 38 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index ac1a99ffbd8d..7f080f5c7def 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -19,12 +19,28 @@ struct task_struct; void io_bitmap_share(struct task_struct *tsk); void io_bitmap_exit(struct task_struct *tsk); +static inline void native_tss_invalidate_io_bitmap(void) +{ + /* + * Invalidate the I/O bitmap by moving io_bitmap_base outside the + * TSS limit so any subsequent I/O access from user space will + * trigger a #GP. 
+ * + * This is correct even when VMEXIT rewrites the TSS limit + * to 0x67 as the only requirement is that the base points + * outside the limit. + */ + this_cpu_write(cpu_tss_rw.x86_tss.io_bitmap_base, + IO_BITMAP_OFFSET_INVALID); +} + void native_tss_update_io_bitmap(void); #ifdef CONFIG_PARAVIRT_XXL #include #else #define tss_update_io_bitmap native_tss_update_io_bitmap +#define tss_invalidate_io_bitmap native_tss_invalidate_io_bitmap #endif #else diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5ca5d297df75..3d2afecde50c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -302,6 +302,11 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) } #ifdef CONFIG_X86_IOPL_IOPERM +static inline void tss_invalidate_io_bitmap(void) +{ + PVOP_VCALL0(cpu.invalidate_io_bitmap); +} + static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 732f62e04ddb..8dfcb2508e6d 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -141,6 +141,7 @@ struct pv_cpu_ops { void (*load_sp0)(unsigned long sp0); #ifdef CONFIG_X86_IOPL_IOPERM + void (*invalidate_io_bitmap)(void); void (*update_io_bitmap)(void); #endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 674a7d66d960..de2138ba38e5 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -324,7 +324,8 @@ struct paravirt_patch_template pv_ops = { .cpu.swapgs = native_swapgs, #ifdef CONFIG_X86_IOPL_IOPERM - .cpu.update_io_bitmap = native_tss_update_io_bitmap, + .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, + .cpu.update_io_bitmap = native_tss_update_io_bitmap, #endif .cpu.start_context_switch = paravirt_nop, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f362ce0d5ac0..fe67dbd76e51 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -322,20 +322,6 @@ void arch_setup_new_exec(void) } #ifdef CONFIG_X86_IOPL_IOPERM -static inline void tss_invalidate_io_bitmap(struct tss_struct *tss) -{ - /* - * Invalidate the I/O bitmap by moving io_bitmap_base outside the - * TSS limit so any subsequent I/O access from user space will - * trigger a #GP. - * - * This is correct even when VMEXIT rewrites the TSS limit - * to 0x67 as the only requirement is that the base points - * outside the limit. - */ - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; -} - static inline void switch_to_bitmap(unsigned long tifp) { /* @@ -346,7 +332,7 @@ static inline void switch_to_bitmap(unsigned long tifp) * user mode. 
*/ if (tifp & _TIF_IO_BITMAP) - tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw)); + tss_invalidate_io_bitmap(); } static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) @@ -380,7 +366,7 @@ void native_tss_update_io_bitmap(void) u16 *base = &tss->x86_tss.io_bitmap_base; if (!test_thread_flag(TIF_IO_BITMAP)) { - tss_invalidate_io_bitmap(tss); + native_tss_invalidate_io_bitmap(); return; } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 0d68948c82ad..c46b9f2e732f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -870,6 +870,17 @@ static void xen_load_sp0(unsigned long sp0) } #ifdef CONFIG_X86_IOPL_IOPERM +static void xen_invalidate_io_bitmap(void) +{ + struct physdev_set_iobitmap iobitmap = { + .bitmap = 0, + .nr_ports = 0, + }; + + native_tss_invalidate_io_bitmap(); + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap); +} + static void xen_update_io_bitmap(void) { struct physdev_set_iobitmap iobitmap; @@ -1099,6 +1110,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .load_sp0 = xen_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM + .invalidate_io_bitmap = xen_invalidate_io_bitmap, .update_io_bitmap = xen_update_io_bitmap, #endif .io_delay = xen_io_delay, -- cgit v1.2.3 From 58ac3154b83938515129c20aa76d456a4c9202a8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 26 Jun 2020 13:34:25 -0700 Subject: x86/entry: Actually disable stack protector Some builds of GCC enable stack protector by default. Simply removing the arguments is not sufficient to disable stack protector, as the stack protector for those GCC builds must be explicitly disabled. Remove the argument removals and add -fno-stack-protector. Additionally include missed x32 argument updates, and adjust whitespace for readability. 
Fixes: 20355e5f73a7 ("x86/entry: Exclude low level entry code from sanitizing") Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/202006261333.585319CA6B@keescook --- arch/x86/entry/Makefile | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index b7a5790d8d63..08bf95dbc911 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -7,12 +7,20 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n -CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong -CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong -CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong +CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE) + +CFLAGS_common.o += -fno-stack-protector +CFLAGS_syscall_64.o += -fno-stack-protector +CFLAGS_syscall_32.o += -fno-stack-protector +CFLAGS_syscall_x32.o += -fno-stack-protector CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,) + obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o -- cgit v1.2.3 From da05b143a308bd6a7a444401f9732678ae63fc70 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Tue, 14 Jul 2020 23:26:31 -0400 Subject: x86/boot: Don't add the EFI stub to targets vmlinux-objs-y is added to targets, which currently means that the EFI stub gets added to the targets as well. It shouldn't be added since it is built elsewhere. This confuses Makefile.build which interprets the EFI stub as a target $(obj)/$(objtree)/drivers/firmware/efi/libstub/lib.a and will create drivers/firmware/efi/libstub/ underneath arch/x86/boot/compressed, to hold this supposed target, if building out-of-tree. [0] Fix this by pulling the stub out of vmlinux-objs-y into efi-obj-y. [0] See scripts/Makefile.build near the end: # Create directories for object files if they do not exist Signed-off-by: Arvind Sankar Signed-off-by: Thomas Gleixner Reviewed-by: Masahiro Yamada Acked-by: Ard Biesheuvel Link: https://lkml.kernel.org/r/20200715032631.1562882-1-nivedita@alum.mit.edu --- arch/x86/boot/compressed/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 7619742f91c9..5a828fde7a42 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -90,8 +90,8 @@ endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o +efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a # The compressed kernel is built with -fPIC/-fPIE so that a boot loader # can place it anywhere in memory and it will still run. 
However, since @@ -115,7 +115,7 @@ endef quiet_cmd_check-and-link-vmlinux = LD $@ cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld) -$(obj)/vmlinux: $(vmlinux-objs-y) FORCE +$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE $(call if_changed,check-and-link-vmlinux) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S -- cgit v1.2.3 From 5714ee50bb4375bd586858ad800b1d9772847452 Mon Sep 17 00:00:00 2001 From: Kevin Buettner Date: Sat, 18 Jul 2020 00:20:03 -0700 Subject: copy_xstate_to_kernel: Fix typo which caused GDB regression This fixes a regression encountered while running the gdb.base/corefile.exp test in GDB's test suite. In my testing, the typo prevented the sw_reserved field of struct fxregs_state from being output to the kernel XSAVES area. Thus the correct mask corresponding to XCR0 was not present in the core file for GDB to interrogate, resulting in the following behavior: [kev@f32-1 gdb]$ ./gdb -q testsuite/outputs/gdb.base/corefile/corefile testsuite/outputs/gdb.base/corefile/corefile.core Reading symbols from testsuite/outputs/gdb.base/corefile/corefile... [New LWP 232880] warning: Unexpected size of section `.reg-xstate/232880' in core file. With the typo fixed, the test works again as expected. Signed-off-by: Kevin Buettner Fixes: 9e4636545933 ("copy_xstate_to_kernel(): don't leave parts of destination uninitialized") Cc: Al Viro Cc: Dave Airlie Signed-off-by: Linus Torvalds --- arch/x86/kernel/fpu/xstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bda2e5eaca0e..ad3a2b37927d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1074,7 +1074,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of copy_part(offsetof(struct fxregs_state, st_space), 128, &xsave->i387.st_space, &kbuf, &offset_start, &count); if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256, + copy_part(xstate_offsets[XFEATURE_SSE], 256, &xsave->i387.xmm_space, &kbuf, &offset_start, &count); /* * Fill xsave->i387.sw_reserved value for ptrace frame: -- cgit v1.2.3 From de2b41be8fcccb2f5b6c480d35df590476344201 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 21 Jul 2020 11:34:48 +0200 Subject: x86, vmlinux.lds: Page-align end of ..page_aligned sections On x86-32 the idt_table with 256 entries needs only 2048 bytes. It is page-aligned, but the end of the .bss..page_aligned section is not guaranteed to be page-aligned. As a result, objects from other .bss sections may end up on the same 4k page as the idt_table, and will accidentally get mapped read-only during boot, causing unexpected page-faults when the kernel writes to them. This could be worked around by making the objects in the page aligned sections page sized, but that's wrong. Explicit sections which store only page aligned objects have an implicit guarantee that the object is alone in the page in which it is placed. That works for all objects except the last one. That's inconsistent. Enforcing page sized objects for these sections would wreck memory sanitizers, because the object becomes artificially larger than it should be and out of bound access becomes legit. Align the end of the .bss..page_aligned and .data..page_aligned sections on page size so all objects placed in these sections are guaranteed to have their own page. 
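For context, a hypothetical example of the kind of object involved (the real case is the 32-bit idt_table, declared with the same attribute):

	#include <linux/linkage.h>

	/* Page-aligned at its start but only 2048 bytes long: without the
	 * section-end alignment added below, ordinary .bss objects can be
	 * placed by the linker in the remainder of its last 4k page. */
	static char example_table[2048] __page_aligned_bss;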
[ tglx: Amended changelog ] Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200721093448.10417-1-joro@8bytes.org --- arch/x86/kernel/vmlinux.lds.S | 1 + include/asm-generic/vmlinux.lds.h | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3bfc8dd8a43d..9a03e5b23135 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -358,6 +358,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) + . = ALIGN(PAGE_SIZE); *(BSS_MAIN) BSS_DECRYPTED . = ALIGN(PAGE_SIZE); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index db600ef218d7..052e0f05a984 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -341,7 +341,8 @@ #define PAGE_ALIGNED_DATA(page_align) \ . = ALIGN(page_align); \ - *(.data..page_aligned) + *(.data..page_aligned) \ + . = ALIGN(page_align); #define READ_MOSTLY_DATA(align) \ . = ALIGN(align); \ @@ -737,7 +738,9 @@ . = ALIGN(bss_align); \ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { \ BSS_FIRST_SECTIONS \ + . = ALIGN(PAGE_SIZE); \ *(.bss..page_aligned) \ + . = ALIGN(PAGE_SIZE); \ *(.dynbss) \ *(BSS_MAIN) \ *(COMMON) \ -- cgit v1.2.3 From 372a8eaa05998cd45b3417d0e0ffd3a70978211a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jul 2020 09:04:25 -0500 Subject: x86/unwind/orc: Fix ORC for newly forked tasks The ORC unwinder fails to unwind newly forked tasks which haven't yet run on the CPU. It correctly reads the 'ret_from_fork' instruction pointer from the stack, but it incorrectly interprets that value as a call stack address rather than a "signal" one, so the address gets incorrectly decremented in the call to orc_find(), resulting in bad ORC data. Fix it by forcing 'ret_from_fork' frames to be signal frames. Reported-by: Wang ShaoBo Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Wang ShaoBo Link: https://lkml.kernel.org/r/f91a8778dde8aae7f71884b5df2b16d552040441.1594994374.git.jpoimboe@redhat.com --- arch/x86/kernel/unwind_orc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 7f969b2d240f..ec88bbe08a32 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -440,8 +440,11 @@ bool unwind_next_frame(struct unwind_state *state) /* * Find the orc_entry associated with the text address. * - * Decrement call return addresses by one so they work for sibling - * calls and calls to noreturn functions. + * For a call frame (as opposed to a signal frame), state->ip points to + * the instruction after the call. That instruction's stack layout + * could be different from the call instruction's layout, for example + * if the call was to a noreturn function. So get the ORC data for the + * call instruction itself. */ orc = orc_find(state->signal ? 
state->ip : state->ip - 1); if (!orc) { @@ -662,6 +665,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp; state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + state->signal = (void *)state->ip == ret_from_fork; } if (get_stack_info((unsigned long *)state->sp, state->task, -- cgit v1.2.3 From 039a7a30ec102ec866d382a66f87f6f7654f8140 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 17 Jul 2020 09:04:26 -0500 Subject: x86/stacktrace: Fix reliable check for empty user task stacks If a user task's stack is empty, or if it only has user regs, ORC reports it as a reliable empty stack. But arch_stack_walk_reliable() incorrectly treats it as unreliable. That happens because the only success path for user tasks is inside the loop, which only iterates on non-empty stacks. Generally, a user task must end in a user regs frame, but an empty stack is an exception to that rule. Thanks to commit 71c95825289f ("x86/unwind/orc: Fix error handling in __unwind_start()"), unwind_start() now sets state->error appropriately. So now for both ORC and FP unwinders, unwind_done() and !unwind_error() always means the end of the stack was successfully reached. So the success path for kthreads is no longer needed -- it can also be used for empty user tasks. Reported-by: Wang ShaoBo Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Wang ShaoBo Link: https://lkml.kernel.org/r/f136a4e5f019219cbc4f4da33b30c2f44fa65b84.1594994374.git.jpoimboe@redhat.com --- arch/x86/kernel/stacktrace.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6ad43fc44556..2fd698e28e4d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -58,7 +58,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, * or a page fault), which can make frame pointers * unreliable. */ - if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; } @@ -81,10 +80,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (unwind_error(&state)) return -EINVAL; - /* Success path for non-user tasks, i.e. kthreads and idle tasks */ - if (!(task->flags & (PF_KTHREAD | PF_IDLE))) - return -EINVAL; - return 0; } -- cgit v1.2.3 From d181d2da0141371bbc360eaea78719203e165e1c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Jul 2020 10:39:54 +0200 Subject: x86/dumpstack: Dump user space code correctly again H.J. reported that post 5.7 a segfault of a user space task does not longer dump the Code bytes when /proc/sys/debug/exception-trace is enabled. It prints 'Code: Bad RIP value.' instead. This was broken by a recent change which made probe_kernel_read() reject non-kernel addresses. Update show_opcodes() so it retrieves user space opcodes via copy_from_user_nmi(). Fixes: 98a23609b103 ("maccess: always use strict semantics for probe_kernel_read") Reported-by: H.J. 
Lu Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/87h7tz306w.fsf@nanos.tec.linutronix.de --- arch/x86/kernel/dumpstack.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b037cfa7c0c5..7401cc12c3cc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -71,6 +71,22 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, + unsigned int nbytes) +{ + if (!user_mode(regs)) + return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); + + /* + * Make sure userspace isn't trying to trick us into dumping kernel + * memory by pointing the userspace instruction pointer at it. + */ + if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX)) + return -EINVAL; + + return copy_from_user_nmi(buf, (void __user *)src, nbytes); +} + /* * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus: * @@ -97,17 +113,8 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) #define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) u8 opcodes[OPCODE_BUFSIZE]; unsigned long prologue = regs->ip - PROLOGUE_SIZE; - bool bad_ip; - - /* - * Make sure userspace isn't trying to trick us into dumping kernel - * memory by pointing the userspace instruction pointer at it. - */ - bad_ip = user_mode(regs) && - __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX); - if (bad_ip || copy_from_kernel_nofault(opcodes, (u8 *)prologue, - OPCODE_BUFSIZE)) { + if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { printk("%sCode: Bad RIP value.\n", loglvl); } else { printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" -- cgit v1.2.3