diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/entry/common.c | 49 | ||||
-rw-r--r-- | arch/x86/entry/entry_32.S | 5 | ||||
-rw-r--r-- | arch/x86/entry/entry_64_compat.S | 31 | ||||
-rw-r--r-- | arch/x86/include/asm/fpu/internal.h | 5 | ||||
-rw-r--r-- | arch/x86/include/asm/idtentry.h | 47 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel.c | 11 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/fpu/core.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/ldt.c | 35 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 14 | ||||
-rw-r--r-- | arch/x86/xen/enlighten_pv.c | 28 | ||||
-rw-r--r-- | arch/x86/xen/xen-asm_64.S | 25 |
12 files changed, 196 insertions, 64 deletions
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bd3f14175193..e83b3f14897c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,32 @@ #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> +/* Check that the stack and regs on entry from user mode are sane. */ +static void check_user_regs(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + /* + * Make sure that the entry code gave us a sensible EFLAGS + * register. Native because we want to check the actual CPU + * state, not the interrupt state as imagined by Xen. + */ + unsigned long flags = native_save_fl(); + WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | + X86_EFLAGS_NT)); + + /* We think we came from user mode. Make sure pt_regs agrees. */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * All entries from user mode (except #DF) should be on the + * normal thread stack and should have user pt_regs in the + * correct location. + */ + WARN_ON_ONCE(!on_thread_stack()); + WARN_ON_ONCE(regs != task_pt_regs(current)); + } +} + #ifdef CONFIG_CONTEXT_TRACKING /** * enter_from_user_mode - Establish state when coming from user mode @@ -127,9 +153,6 @@ static long syscall_trace_enter(struct pt_regs *regs) unsigned long ret = 0; u32 work; - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - BUG_ON(regs != task_pt_regs(current)); - work = READ_ONCE(ti->flags); if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { @@ -346,6 +369,8 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -409,6 +434,8 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { + check_user_regs(regs); + enter_from_user_mode(); instrumentation_begin(); @@ -460,6 +487,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) vdso_image_32.sym_int80_landing_pad; bool success; + check_user_regs(regs); + /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. @@ -510,6 +539,18 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif } + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +{ + /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ + regs->sp = regs->bp; + + /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ + regs->flags |= X86_EFLAGS_IF; + + return do_fast_syscall_32(regs); +} #endif SYSCALL_DEFINE0(ni_syscall) @@ -553,6 +594,7 @@ SYSCALL_DEFINE0(ni_syscall) bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) { if (user_mode(regs)) { + check_user_regs(regs); enter_from_user_mode(); return false; } @@ -686,6 +728,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) */ void noinstr idtentry_enter_user(struct pt_regs *regs) { + check_user_regs(regs); enter_from_user_mode(); } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 024d7d276cd4..2d0bd5d5f032 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -933,9 +933,8 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ - pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushl $0 /* pt_regs->sp (placeholder) */ pushfl /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ @@ -965,7 +964,7 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_flags_fixed: movl %esp, %eax - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 0f974ae01e62..541fdaf64045 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,29 +57,30 @@ SYM_CODE_START(entry_SYSENTER_compat) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp (stashed in bp) */ + pushq $0 /* pt_regs->sp = 0 (placeholder) */ /* * Push flags. This is nasty. First, interrupts are currently - * off, but we need pt_regs->flags to have IF set. Second, even - * if TF was set when SYSENTER started, it's clear by now. We fix - * that later using TIF_SINGLESTEP. + * off, but we need pt_regs->flags to have IF set. Second, if TS + * was set in usermode, it's still set, and we're singlestepping + * through this code. do_SYSENTER_32() will fix up IF. */ pushfq /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ +SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -135,7 +136,7 @@ SYM_CODE_START(entry_SYSENTER_compat) .Lsysenter_flags_fixed: movq %rsp, %rdi - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42159f45bf9c..845e7481ab77 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -623,6 +623,11 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * MXCSR and XCR definitions: */ +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + extern unsigned int mxcsr_feature_mask; #define XCR_XFEATURE_ENABLED_MASK 0x00000000 diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cf51c50eb356..eeac6dc2adaa 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -353,10 +353,6 @@ static __always_inline void __##func(struct pt_regs *regs) #else /* CONFIG_X86_64 */ -/* Maps to a regular IDTENTRY on 32bit for now */ -# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY -# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY - /** * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant * @vector: Vector number (ignored for C) @@ -387,28 +383,18 @@ __visible noinstr void func(struct pt_regs *regs, \ #endif /* !CONFIG_X86_64 */ /* C-Code mapping */ +#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW +#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW + +#ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST -#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW -#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW - #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST - -/** - * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points - * @vector: Vector number (ignored for C) - * @func: Function name of the entry point - * - * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM - * indirection magic. - */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - asmlinkage void xen_asm_exc_xen##func(void); \ - asmlinkage void asm_exc_xen##func(void) +#endif #else /* !__ASSEMBLY__ */ @@ -455,9 +441,6 @@ __visible noinstr void func(struct pt_regs *regs, \ # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) -# define DECLARE_IDTENTRY_DEBUG(vector, func) \ - DECLARE_IDTENTRY(vector, func) - /* No ASM emitted for DF as this goes through a C shim */ # define DECLARE_IDTENTRY_DF(vector, func) @@ -469,10 +452,6 @@ __visible noinstr void func(struct pt_regs *regs, \ /* No ASM code emitted for NMI */ #define DECLARE_IDTENTRY_NMI(vector, func) -/* XEN NMI and DB wrapper */ -#define DECLARE_IDTENTRY_XEN(vector, func) \ - idtentry vector asm_exc_xen##func exc_##func has_error_code=0 - /* * ASM code to emit the common vector entry stubs where each stub is * packed into 8 bytes. @@ -565,16 +544,28 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); #ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); +#endif #endif /* NMI */ DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); -DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); +#endif /* #DB */ +#ifdef CONFIG_X86_64 DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); -DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); +#else +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug); +#endif +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); +#endif /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c25a67a34bd3..0ab48f1cdf84 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -50,6 +50,13 @@ static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; /* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +/* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists * CPU models in which having conflicting memory types still leads to @@ -1071,7 +1078,8 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { - split_lock_verify_msr(sld_state != sld_off); + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); } static void split_lock_warn(unsigned long ip) @@ -1177,5 +1185,6 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) return; } + cpu_model_supports_sld = true; split_lock_setup(); } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index fbe89a92ff36..14e4b4d17ee5 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1901,6 +1901,8 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + WARN_ON_ONCE(user_mode(regs)); + /* * Only required when from kernel mode. See * mce_check_crashing_cpu() for details. @@ -1954,7 +1956,7 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) } #else /* 32bit unified entry point */ -DEFINE_IDTENTRY_MCE(exc_machine_check) +DEFINE_IDTENTRY_RAW(exc_machine_check) { unsigned long dr7; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 06c818967bb6..15247b96c6ea 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,6 +101,12 @@ void kernel_fpu_begin(void) copy_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); + + if (boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 8748321c4486..34e918ad34d4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -29,6 +29,8 @@ #include <asm/mmu_context.h> #include <asm/pgtable_areas.h> +#include <xen/xen.h> + /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) @@ -543,6 +545,37 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) return bytecount; } +static bool allow_16bit_segments(void) +{ + if (!IS_ENABLED(CONFIG_X86_16BIT)) + return false; + +#ifdef CONFIG_XEN_PV + /* + * Xen PV does not implement ESPFIX64, which means that 16-bit + * segments will not work correctly. Until either Xen PV implements + * ESPFIX64 and can signal this fact to the guest or unless someone + * provides compelling evidence that allowing broken 16-bit segments + * is worthwhile, disallow 16-bit segments under Xen PV. + */ + if (xen_pv_domain()) { + static DEFINE_MUTEX(xen_warning); + static bool warned; + + mutex_lock(&xen_warning); + if (!warned) { + pr_info("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); + warned = true; + } + mutex_unlock(&xen_warning); + + return false; + } +#endif + + return true; +} + static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; @@ -574,7 +607,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) /* The user wants to clear the entry. */ memset(&ldt, 0, sizeof(ldt)); } else { - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f58679e487f6..b038695f36c5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -870,6 +870,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, trace_hardirqs_off_finish(); /* + * If something gets miswired and we end up here for a user mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(user_mode(regs)); + + /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ @@ -887,6 +893,12 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { + /* + * If something gets miswired and we end up here for a kernel mode + * #DB, we will malfunction. + */ + WARN_ON_ONCE(!user_mode(regs)); + idtentry_enter_user(regs); instrumentation_begin(); @@ -917,7 +929,7 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) } #else /* 32 bit does not have separate entry points. */ -DEFINE_IDTENTRY_DEBUG(exc_debug) +DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6, dr7; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index acc49fa6a097..0d68948c82ad 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -598,6 +598,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, } #ifdef CONFIG_X86_64 +void noist_exc_debug(struct pt_regs *regs); + +DEFINE_IDTENTRY_RAW(xenpv_exc_nmi) +{ + /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */ + exc_nmi(regs); +} + +DEFINE_IDTENTRY_RAW(xenpv_exc_debug) +{ + /* + * There's no IST on Xen PV, but we still need to dispatch + * to the correct handler. + */ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -609,18 +629,18 @@ struct trap_array_entry { .xen = xen_asm_##func, \ .ist_okay = ist_ok } -#define TRAP_ENTRY_REDIR(func, xenfunc, ist_ok) { \ +#define TRAP_ENTRY_REDIR(func, ist_ok) { \ .orig = asm_##func, \ - .xen = xen_asm_##xenfunc, \ + .xen = xen_asm_xenpv_##func, \ .ist_okay = ist_ok } static struct trap_array_entry trap_array[] = { - TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ), + TRAP_ENTRY_REDIR(exc_debug, true ), TRAP_ENTRY(exc_double_fault, true ), #ifdef CONFIG_X86_MCE TRAP_ENTRY(exc_machine_check, true ), #endif - TRAP_ENTRY_REDIR(exc_nmi, exc_xennmi, true ), + TRAP_ENTRY_REDIR(exc_nmi, true ), TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 5d252aaeade8..aab1d99b2b48 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,10 +29,9 @@ _ASM_NOKPROBE(xen_\name) .endm xen_pv_trap asm_exc_divide_error -xen_pv_trap asm_exc_debug -xen_pv_trap asm_exc_xendebug +xen_pv_trap asm_xenpv_exc_debug xen_pv_trap asm_exc_int3 -xen_pv_trap asm_exc_xennmi +xen_pv_trap asm_xenpv_exc_nmi xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op @@ -161,10 +160,22 @@ SYM_FUNC_END(xen_syscall32_target) /* 32-bit compat sysenter target */ SYM_FUNC_START(xen_sysenter_target) - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp - jmp entry_SYSENTER_compat + /* + * NB: Xen is polite and clears TF from EFLAGS for us. This means + * that we don't need to guard against single step exceptions here. + */ + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSENTER_compat_after_hwframe SYM_FUNC_END(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ |