diff options
61 files changed, 582 insertions, 374 deletions
diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt index af0c9a4c65a6..cd4b29be29af 100644 --- a/Documentation/x86/orc-unwinder.txt +++ b/Documentation/x86/orc-unwinder.txt @@ -4,7 +4,7 @@ ORC unwinder Overview -------- -The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is +The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is similar in concept to a DWARF unwinder. The difference is that the format of the ORC data is much simpler than DWARF, which in turn allows the ORC unwinder to be much simpler and faster. @@ -933,8 +933,8 @@ ifdef CONFIG_STACK_VALIDATION ifeq ($(has_libelf),1) objtool_target := tools/objtool FORCE else - ifdef CONFIG_ORC_UNWINDER - $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + ifdef CONFIG_UNWINDER_ORC + $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") else $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 32779beb56e2..9ab7629feac7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -170,7 +170,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 71a48a30fc84..a4ff214fb760 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -358,28 +358,14 @@ config PUNIT_ATOM_DEBUG choice prompt "Choose kernel unwinder" - default FRAME_POINTER_UNWINDER + default UNWINDER_ORC if X86_64 + default UNWINDER_FRAME_POINTER if X86_32 ---help--- This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, livepatch, lockdep, and more. -config FRAME_POINTER_UNWINDER - bool "Frame pointer unwinder" - select FRAME_POINTER - ---help--- - This option enables the frame pointer unwinder for unwinding kernel - stack traces. - - The unwinder itself is fast and it uses less RAM than the ORC - unwinder, but the kernel text size will grow by ~3% and the kernel's - overall performance will degrade by roughly 5-10%. - - This option is recommended if you want to use the livepatch - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - -config ORC_UNWINDER +config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 select STACK_VALIDATION @@ -395,7 +381,22 @@ config ORC_UNWINDER Enabling this option will increase the kernel's runtime memory usage by roughly 2-4MB, depending on your kernel config. -config GUESS_UNWINDER +config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT ---help--- @@ -410,7 +411,7 @@ config GUESS_UNWINDER endchoice config FRAME_POINTER - depends on !ORC_UNWINDER && !GUESS_UNWINDER + depends on !UNWINDER_ORC && !UNWINDER_GUESS bool endmenu diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 550cd5012b73..66c9e2aab16c 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,5 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set -CONFIG_GUESS_UNWINDER=y -# CONFIG_FRAME_POINTER_UNWINDER is not set +CONFIG_UNWINDER_GUESS=y +# CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4a4b16e56d35..e32fc1f274d8 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y +CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 3a2dc3dc6cac..f3cd26f48332 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -45,7 +45,7 @@ ENTRY(chacha20_8block_xor_avx2) vzeroupper # 4 * 32 byte stack, 32-byte aligned - mov %rsp, %r8 + lea 8(%rsp),%r10 and $~31, %rsp sub $0x80, %rsp @@ -443,6 +443,6 @@ ENTRY(chacha20_8block_xor_avx2) vmovdqu %ymm15,0x01e0(%rsi) vzeroupper - mov %r8,%rsp + lea -8(%r10),%rsp ret ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 3f511a7d73b8..512a2b500fd1 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -160,7 +160,7 @@ ENTRY(chacha20_4block_xor_ssse3) # done with the slightly better performing SSSE3 byte shuffling, # 7/12-bit word rotation uses traditional shift+OR. - mov %rsp,%r11 + lea 8(%rsp),%r10 sub $0x80,%rsp and $~63,%rsp @@ -625,6 +625,6 @@ ENTRY(chacha20_4block_xor_ssse3) pxor %xmm1,%xmm15 movdqu %xmm15,0xf0(%rsi) - mov %r11,%rsp + lea -8(%r10),%rsp ret ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 640aafebdc00..1895a685d3dd 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -141,56 +141,25 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset .endm - .macro RESTORE_EXTRA_REGS offset=0 - movq 0*8+\offset(%rsp), %r15 - movq 1*8+\offset(%rsp), %r14 - movq 2*8+\offset(%rsp), %r13 - movq 3*8+\offset(%rsp), %r12 - movq 4*8+\offset(%rsp), %rbp - movq 5*8+\offset(%rsp), %rbx - UNWIND_HINT_REGS offset=\offset extra=0 - .endm - - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq 6*8(%rsp), %r11 - .endif - .if \rstor_r8910 - movq 7*8(%rsp), %r10 - movq 8*8(%rsp), %r9 - movq 9*8(%rsp), %r8 - .endif - .if \rstor_rax - movq 10*8(%rsp), %rax - .endif - .if \rstor_rcx - movq 11*8(%rsp), %rcx - .endif - .if \rstor_rdx - movq 12*8(%rsp), %rdx - .endif - movq 13*8(%rsp), %rsi - movq 14*8(%rsp), %rdi - UNWIND_HINT_IRET_REGS offset=16*8 - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RAX - RESTORE_C_REGS_HELPER 0,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX - RESTORE_C_REGS_HELPER 1,0,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_R11 - RESTORE_C_REGS_HELPER 1,1,0,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX_R11 - RESTORE_C_REGS_HELPER 1,0,0,1,1 - .endm - - .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - subq $-(15*8+\addskip), %rsp + .macro POP_EXTRA_REGS + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .endm + + .macro POP_C_REGS + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi .endm .macro icebp diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f6cdb7a1455e..84263c79a119 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -220,10 +220,9 @@ entry_SYSCALL_64_fastpath: TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp + addq $6*8, %rsp /* skip extra regs -- they were preserved */ UNWIND_HINT_EMPTY - USERGS_SYSRET64 + jmp .Lpop_c_regs_except_rcx_r11_and_sysret 1: /* @@ -245,17 +244,18 @@ entry_SYSCALL64_slow_path: call do_syscall_64 /* returns with IRQs disabled */ return_from_SYSCALL_64: - RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. + * a completely clean 64-bit userspace context. If we're not, + * go to the slow exit path. */ movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 - cmpq %rcx, %r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + + cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ + jne swapgs_restore_regs_and_return_to_usermode /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -273,14 +273,14 @@ return_from_SYSCALL_64: /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode movq R11(%rsp), %r11 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot @@ -301,12 +301,12 @@ return_from_SYSCALL_64: * would never get past 'stuck_here'. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + jnz swapgs_restore_regs_and_return_to_usermode /* nothing to check for RSP */ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * We win! This label is here just for ease of understanding @@ -314,14 +314,20 @@ return_from_SYSCALL_64: */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp UNWIND_HINT_EMPTY + POP_EXTRA_REGS +.Lpop_c_regs_except_rcx_r11_and_sysret: + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi + popq %rdi + movq RSP-ORIG_RAX(%rsp), %rsp USERGS_SYSRET64 - -opportunistic_sysret_failed: - SWAPGS - jmp restore_c_regs_and_iret END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -422,8 +428,7 @@ ENTRY(ret_from_fork) movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ @@ -611,8 +616,21 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ + +GLOBAL(swapgs_restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testb $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif SWAPGS - jmp restore_regs_and_iret + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ + INTERRUPT_RETURN + /* Returning to kernel space */ retint_kernel: @@ -632,15 +650,17 @@ retint_kernel: */ TRACE_IRQS_IRETQ -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -GLOBAL(restore_regs_and_iret) - RESTORE_EXTRA_REGS -restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ + testb $3, CS(%rsp) + jz 1f + ud2 +1: +#endif + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN ENTRY(native_iret) @@ -817,7 +837,7 @@ ENTRY(\sym) ASM_CLAC - .ifeq \has_error_code + .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif @@ -1058,6 +1078,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN +idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 #endif @@ -1111,17 +1132,14 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1222,10 +1240,13 @@ ENTRY(error_exit) jmp retint_user END(error_exit) -/* Runs on exception stack */ -/* XXX: broken on Xen PV */ +/* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. + */ ENTRY(nmi) UNWIND_HINT_IRET_REGS + /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1283,7 +1304,7 @@ ENTRY(nmi) * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS_UNSAFE_STACK + swapgs cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1327,8 +1348,7 @@ ENTRY(nmi) * Return back to user mode. We must *not* do the normal exit * work, because we don't want to enable interrupts. */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* @@ -1449,7 +1469,7 @@ nested_nmi_out: popq %rdx /* We are returning to kernel mode, so this cannot result in a fault. */ - INTERRUPT_RETURN + iretq first_nmi: /* Restore rdx. */ @@ -1480,7 +1500,7 @@ first_nmi: pushfq /* RFLAGS */ pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ - INTERRUPT_RETURN /* continues at repeat_nmi below */ + iretq /* continues at repeat_nmi below */ UNWIND_HINT_IRET_REGS 1: #endif @@ -1543,29 +1563,34 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS + POP_EXTRA_REGS + POP_C_REGS - /* Point RSP at the "iret" frame. */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 + /* + * Skip orig_ax and the "outermost" frame to point RSP at the "iret" + * at the "iret" frame. + */ + addq $6*8, %rsp /* * Clear "NMI executing". Set DF first so that we can easily * distinguish the remaining code between here and IRET from - * the SYSCALL entry and exit paths. On a native kernel, we - * could just inspect RIP, but, on paravirt kernels, - * INTERRUPT_RETURN can translate into a jump into a - * hypercall page. + * the SYSCALL entry and exit paths. + * + * We arguably should just inspect RIP instead, but I (Andy) wrote + * this code when I had the misapprehension that Xen PV supported + * NMIs, and Xen PV would break that approach. */ std movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI - * stack in a single instruction. We are returning to kernel - * mode, so this cannot result in a fault. + * iretq reads the "iret" frame and exits the NMI stack in a + * single instruction. We are returning to kernel mode, so this + * cannot result in a fault. Similarly, we don't need to worry + * about espfix64 on the way back to kernel mode. */ - INTERRUPT_RETURN + iretq END(nmi) ENTRY(ignore_sysret) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e26c25ca7756..932b96ce1b06 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -336,8 +336,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode END(entry_INT80_compat) ENTRY(stub32_clone) diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 5b0579abb398..3ac991d81e74 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG "\n\t" + asm volatile(RDRAND_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT "\n\t" + asm volatile(RDRAND_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v) static inline bool rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG "\n\t" + asm volatile(RDSEED_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; @@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v) static inline bool rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT "\n\t" + asm volatile(RDSEED_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 854022772c5b..8cee8db6dffb 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -142,7 +142,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1\n\t" + asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) : CC_OUT(s) (negative), ADDR : "ir" ((char) ~(1 << nr)) : "memory"); @@ -245,7 +245,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * { bool oldbit; - asm("bts %2,%1\n\t" + asm("bts %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -285,7 +285,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long { bool oldbit; - asm volatile("btr %2,%1\n\t" + asm volatile("btr %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -297,7 +297,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon { bool oldbit; - asm volatile("btc %2,%1\n\t" + asm volatile("btc %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); @@ -328,7 +328,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l { bool oldbit; - asm volatile("bt %2,%1\n\t" + asm volatile("bt %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 5343c19814b3..948b6d8ec46f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include <linux/types.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <asm/processor.h> #include <asm/user32.h> #include <asm/unistd.h> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d59c15c3defd..225fd8374fae 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) -#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) -#define setup_clear_cpu_cap(bit) do { \ - clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cpu_caps_cleared); \ -} while (0) + +extern void setup_clear_cpu_cap(unsigned int bit); +extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2519c6c801c9..b0556f882aa8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -21,6 +21,11 @@ * this feature bit is not displayed in /proc/cpuinfo at all. */ +/* + * When adding new features here that depend on other features, + * please update the table in kernel/cpu/cpuid-deps.c + */ + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ @@ -294,6 +299,12 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 9eb7c718aaf8..9f05a1002aa9 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,7 +5,7 @@ #include <asm/orc_types.h> struct mod_arch_specific { -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 12deec722cf0..43d4f90edebc 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,10 +15,9 @@ #include <linux/cpumask.h> #include <asm/frame.h> -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); + PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 280d94c36dad..a916788ac478 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -133,7 +133,7 @@ struct pv_cpu_ops { void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + void (*load_sp0)(unsigned long sp0); void (*set_iopl_mask)(unsigned mask); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 9fa03604b2b3..b21a475fd7ed 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -525,7 +525,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, { bool oldbit; - asm volatile("bt "__percpu_arg(2)",%1\n\t" + asm volatile("bt "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b390ff76e58f..f10dae14f951 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -430,7 +430,9 @@ typedef struct { struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; +#ifdef CONFIG_X86_32 unsigned long sp0; +#endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; @@ -517,16 +519,9 @@ static inline void native_set_iopl_mask(unsigned mask) } static inline void -native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +native_load_sp0(unsigned long sp0) { - tss->x86_tss.sp0 = thread->sp0; -#ifdef CONFIG_X86_32 - /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -#endif + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } static inline void native_swapgs(void) @@ -546,15 +541,20 @@ static inline unsigned long current_top_of_stack(void) #endif } +static inline bool on_thread_stack(void) +{ + return (unsigned long)(current_top_of_stack() - + current_stack_pointer) < THREAD_SIZE; +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else #define __cpuid native_cpuid -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - native_load_sp0(tss, thread); + native_load_sp0(sp0); } #define set_iopl_mask native_set_iopl_mask @@ -803,6 +803,15 @@ static inline void spin_lock_prefetch(const void *x) #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) +#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) + +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ +}) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -822,23 +831,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -/* - * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. - * This is necessary to guarantee that the entire "struct pt_regs" - * is accessible even if the CPU haven't stored the SS/ESP registers - * on the stack (interrupt gate does not save these registers - * when switching to the same priv ring). - * Therefore beware: accessing the ss/esp fields of the - * "struct pt_regs" is possible, but they may contain the - * completely wrong values. - */ -#define task_pt_regs(task) \ -({ \ - unsigned long __ptr = (unsigned long)task_stack_page(task); \ - __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ - ((struct pt_regs *)__ptr) - 1; \ -}) - #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else @@ -872,11 +864,9 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK, \ .addr_limit = KERNEL_DS, \ } -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 91c04c8e67fa..e2afbf689309 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -135,9 +135,9 @@ static inline int v8086_mode(struct pt_regs *regs) #endif } -#ifdef CONFIG_X86_64 static inline bool user_64bit_mode(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT /* * On non-paravirt systems, this is the only long mode CPL 3 @@ -148,8 +148,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs) /* Headers are too twisted for this to go in paravirt.h. */ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif +#else /* !CONFIG_X86_64 */ + return false; +#endif } +#ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp #endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 045f99211a99..0c411c8bbdbd 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -28,7 +28,7 @@ cc_label: \ #define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ bool c; \ - asm volatile (fullop ";" CC_SET(cc) \ + asm volatile (fullop CC_SET(cc) \ : [counter] "+m" (var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ return c; \ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index fcc5cd387fd1..010cd6e4eafc 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include <linux/sched/task_stack.h> + struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to_asm(struct task_struct *prev, @@ -72,4 +74,26 @@ do { \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) +#ifdef CONFIG_X86_32 +static inline void refresh_sysenter_cs(struct thread_struct *thread) +{ + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + return; + + this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +} +#endif + +/* This is used when switching tasks or entering/exiting vm86 mode. */ +static inline void update_sp0(struct task_struct *task) +{ +#ifdef CONFIG_X86_32 + load_sp0(task->thread.sp0); +#else + load_sp0(task_top_of_stack(task)); +#endif +} + #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 91dfcafe27a6..bad25bb80679 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); asmlinkage long sys_iopl(unsigned int); /* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ asmlinkage long sys_rt_sigreturn(void); diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 39f7a27bef13..6b086c39e6dc 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -33,11 +33,6 @@ DECLARE_EVENT_CLASS(x86_fpu, ) ); -DEFINE_EVENT(x86_fpu, x86_fpu_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -73,11 +68,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 5545f6459bf5..e76ce80ca18b 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -37,9 +37,9 @@ asmlinkage void simd_coprocessor_error(void); #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) asmlinkage void xen_divide_error(void); +asmlinkage void xen_xennmi(void); asmlinkage void xen_xendebug(void); asmlinkage void xen_xenint3(void); -asmlinkage void xen_nmi(void); asmlinkage void xen_overflow(void); asmlinkage void xen_bounds(void); asmlinkage void xen_invalid_op(void); @@ -144,4 +144,22 @@ enum { X86_TRAP_IRET = 32, /* 32, IRET Exception */ }; +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + */ +enum x86_pf_error_code { + X86_PF_PROT = 1 << 0, + X86_PF_WRITE = 1 << 1, + X86_PF_USER = 1 << 2, + X86_PF_RSVD = 1 << 3, + X86_PF_INSTR = 1 << 4, + X86_PF_PK = 1 << 5, +}; #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9f793e2df7a..35d67dc7b69f 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,11 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#if defined(CONFIG_ORC_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs; -#elif defined(CONFIG_FRAME_POINTER_UNWINDER) +#elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; @@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } -#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) +#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) } #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 185f3d10c194..39946d0a1d41 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -151,5 +151,8 @@ #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a97a6b611531..d449c5a9db96 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -27,7 +27,6 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y @@ -128,9 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o -obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o -obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e17942c131c8..de260fae1017 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -22,6 +22,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9176bae7fd8..cdf79ab628c2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1301,18 +1301,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(")\n"); } -static __init int setup_disablecpuid(char *arg) +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) { - int bit; - - if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; } -__setup("clearcpuid=", setup_disablecpuid); +__setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(union irq_stack_union, @@ -1572,9 +1570,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(t, ¤t->thread); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); clear_all_debug_regs(); @@ -1596,7 +1598,6 @@ void cpu_init(void) int cpu = smp_processor_id(); struct task_struct *curr = current; struct tss_struct *t = &per_cpu(cpu_tss, cpu); - struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); @@ -1627,9 +1628,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(t, thread); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ set_tss_desc(cpu, t); load_TR_desc(); + load_mm_ldt(&init_mm); t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000000..904b0a3c4e53 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,121 @@ +/* Declare dependencies between CPUIDs */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <asm/cpufeature.h> + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + {} +}; + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + /* + * Note: This could use the non atomic __*_bit() variants, but the + * rest of the cpufeature code uses atomics as well, so keep it for + * consistency. Cleanup all of it separately. + */ + if (!c) { + clear_cpu_cap(&boot_cpu_data, feature); + set_bit(feature, (unsigned long *)cpu_caps_cleared); + } else { + clear_bit(feature, (unsigned long *)c->x86_capability); + } +} + +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7affb7e3d9a5..6abd83572b01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { + char arg[32]; + char *argptr = arg; + int bit; + if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option(boot_command_line, "clearcpuid", arg, + sizeof(arg)) && + get_option(&argptr, &bit) && + bit >= 0 && + bit < NCAPINTS * 32) + setup_clear_cpu_cap(bit); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f1d5476c9022..87a57b7642d3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,6 +15,7 @@ #include <asm/fpu/xstate.h> #include <asm/tlbflush.h> +#include <asm/cpufeature.h> /* * Although we spell it out in here, the Processor Trace @@ -36,6 +37,19 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ @@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size; void fpu__xstate_clear_all_cpu_caps(void) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); - setup_clear_cpu_cap(X86_FEATURE_AVX512BW); - setup_clear_cpu_cap(X86_FEATURE_AVX512VL); - setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_XGETBV1); - setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); - setup_clear_cpu_cap(X86_FEATURE_PKU); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); - setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* @@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void) unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; int err; + int i; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void) goto out_disable; } + /* + * Clear XSAVE features that are disabled in the normal CPUID. + */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask &= ~BIT(i); + } + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9ed3074d0d27..62474df4e3e7 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -211,9 +211,6 @@ ENTRY(startup_32_smp) #endif .Ldefault_entry: -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $(CR0_STATE & ~X86_CR0_PG),%eax movl %eax,%cr0 @@ -401,7 +398,7 @@ ENTRY(early_idt_handler_array) # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2be7d1e7fcf1..c03e02a83c4e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -50,6 +50,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -89,6 +90,7 @@ startup_64: addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. @@ -133,6 +135,7 @@ ENTRY(secondary_startup_64) movq $1f, %rax jmp *%rax 1: + UNWIND_HINT_EMPTY /* Check if nx is implemented */ movl $0x80000001, %eax @@ -150,9 +153,6 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 @@ -235,7 +235,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(secondary_startup_64) +END(secondary_startup_64) #include "verify_cpu.S" @@ -247,6 +247,7 @@ ENDPROC(secondary_startup_64) */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp + UNWIND_HINT_EMPTY jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -266,26 +267,24 @@ ENDPROC(start_cpu0) .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA -bad_address: - jmp bad_address - __INIT ENTRY(early_idt_handler_array) - # 104(%rsp) %rflags - # 96(%rsp) %cs - # 88(%rsp) %rip - # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 - pushq $0 # Dummy error code, to make stack frame uniform + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) + UNWIND_HINT_IRET_REGS offset=16 +END(early_idt_handler_array) early_idt_handler_common: /* @@ -313,6 +312,7 @@ early_idt_handler_common: pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS cmpq $14,%rsi /* Page fault? */ jnz 10f @@ -327,8 +327,8 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) - jmp restore_regs_and_iret -ENDPROC(early_idt_handler_common) + jmp restore_regs_and_return_to_kernel +END(early_idt_handler_common) __INITDATA @@ -435,7 +435,7 @@ ENTRY(phys_base) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" - + __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index f0e64db18ac8..0402d44deb4d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/smp.h> +#include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> @@ -294,8 +295,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { int ret = -ENOSYS; @@ -313,5 +314,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, ret = write_ldt(ptr, bytecount, 0); break; } - return ret; + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but tht ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bd6b85fac666..ff8a9acbcf8b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -48,7 +48,13 @@ */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .x86_tss = { - .sp0 = TOP_OF_INIT_STACK, + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 11966251cd42..45bf0c5f93e1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0 and cpu_current_top_of_stack. This changes - * current_thread_info(). + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. */ - load_sp0(tss, next); + update_sp0(next_p); + refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 302e7b2572d1..eeeb34f85c25 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct inactive_task_frame *frame; struct task_struct *me = current; - p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; @@ -464,8 +463,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ this_cpu_write(current_task, next_p); - /* Reload esp0 and ss1. This changes current_thread_info(). */ - load_sp0(tss, next); + /* Reload sp0. */ + update_sp0(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..06c18fe1c09e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -961,8 +961,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - per_cpu(cpu_current_top_of_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 67db4f43309e..42a9c4458f5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer) >= THREAD_SIZE); + BUG_ON(!on_thread_stack()); preempt_enable_no_resched(); } diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 014ea59aa153..3d3c2f71f617 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -33,7 +33,7 @@ #include <asm/cpufeatures.h> #include <asm/msr-index.h> -verify_cpu: +ENTRY(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -139,3 +139,4 @@ verify_cpu: popf # Restore caller passed flags xorl %eax, %eax ret +ENDPROC(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 7924a5356c8a..a7b44c75c642 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -54,6 +54,7 @@ #include <asm/irq.h> #include <asm/traps.h> #include <asm/vm86.h> +#include <asm/switch_to.h> /* * Known problems: @@ -93,7 +94,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; @@ -145,12 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) do_exit(SIGSEGV); } - tss = &per_cpu(cpu_tss, get_cpu()); + preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &tsk->thread); + update_sp0(tsk); + refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; - put_cpu(); + preempt_enable(); memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); @@ -236,7 +237,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; struct kernel_vm86_regs vm86regs; @@ -364,15 +364,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86->saved_sp0 = tsk->thread.sp0; lazy_save_gs(vm86->regs32.gs); - tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ + preempt_disable(); tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; + refresh_sysenter_cs(&tsk->thread); + } - load_sp0(tss, &tsk->thread); - put_cpu(); + update_sp0(tsk); + preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e2baeaa053a5..db71c73530bd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -29,26 +29,6 @@ #include <asm/trace/exceptions.h> /* - * Page fault error code bits: - * - * bit 0 == 0: no page found 1: protection fault - * bit 1 == 0: read access 1: write access - * bit 2 == 0: kernel-mode access 1: user-mode access - * bit 3 == 1: use of reserved bit detected - * bit 4 == 1: fault was an instruction fetch - * bit 5 == 1: protection keys block access - */ -enum x86_pf_error_code { - - PF_PROT = 1 << 0, - PF_WRITE = 1 << 1, - PF_USER = 1 << 2, - PF_RSVD = 1 << 3, - PF_INSTR = 1 << 4, - PF_PK = 1 << 5, -}; - -/* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ @@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); @@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * siginfo so userspace can discover which protection key was set * on the PTE. * - * If we get here, we know that the hardware signaled a PF_PK + * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. @@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) /* * force_sig_info_fault() is called from a number of * contexts, some of which have a VMA and some of which - * do not. The PF_PK handing happens after we have a + * do not. The X86_PF_PK handing happens after we have a * valid VMA, so we should never reach this without a * valid VMA. */ @@ -697,7 +677,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (!oops_may_print()) return; - if (error_code & PF_INSTR) { + if (error_code & X86_PF_INSTR) { unsigned int level; pgd_t *pgd; pte_t *pte; @@ -779,7 +759,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, */ if (current->thread.sig_on_uaccess_err && signal) { tsk->thread.trap_nr = X86_TRAP_PF; - tsk->thread.error_code = error_code | PF_USER; + tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. */ @@ -897,7 +877,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * It's possible to have interrupts off here: */ @@ -918,7 +898,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * Instruction fetch faults in the vsyscall page might need * emulation. */ - if (unlikely((error_code & PF_INSTR) && + if (unlikely((error_code & X86_PF_INSTR) && ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; @@ -931,7 +911,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * are always protection faults. */ if (address >= TASK_SIZE_MAX) - error_code |= PF_PROT; + error_code |= X86_PF_PROT; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -992,11 +972,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, if (!boot_cpu_has(X86_FEATURE_OSPKE)) return false; - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return true; return false; } @@ -1024,7 +1004,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, int code = BUS_ADRERR; /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -1052,14 +1032,14 @@ static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, unsigned int fault) { - if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) { + if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; @@ -1084,16 +1064,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, static int spurious_fault_check(unsigned long error_code, pte_t *pte) { - if ((error_code & PF_WRITE) && !pte_write(*pte)) + if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; - if ((error_code & PF_INSTR) && !pte_exec(*pte)) + if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; /* * Note: We do not do lazy flushing on protection key - * changes, so no spurious fault will ever set PF_PK. + * changes, so no spurious fault will ever set X86_PF_PK. */ - if ((error_code & PF_PK)) + if ((error_code & X86_PF_PK)) return 1; return 1; @@ -1139,8 +1119,8 @@ spurious_fault(unsigned long error_code, unsigned long address) * change, so user accesses are not expected to cause spurious * faults. */ - if (error_code != (PF_WRITE | PF_PROT) - && error_code != (PF_INSTR | PF_PROT)) + if (error_code != (X86_PF_WRITE | X86_PF_PROT) && + error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); @@ -1200,19 +1180,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ - if (error_code & PF_PK) + if (error_code & X86_PF_PK) return 1; /* * Make sure to check the VMA so that we do not perform - * faults just to hit a PF_PK as soon as we fill in a + * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), - (error_code & PF_INSTR), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) return 1; - if (error_code & PF_WRITE) { + if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -1220,7 +1200,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) } /* read, present: */ - if (unlikely(error_code & PF_PROT)) + if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ @@ -1243,7 +1223,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) if (!static_cpu_has(X86_FEATURE_SMAP)) return false; - if (error_code & PF_USER) + if (error_code & X86_PF_USER) return false; if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) @@ -1296,7 +1276,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * protection error (error_code & 9) == 0. */ if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; @@ -1324,7 +1304,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (unlikely(kprobes_fault(regs))) return; - if (unlikely(error_code & PF_RSVD)) + if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); if (unlikely(smap_violation(error_code, regs))) { @@ -1350,7 +1330,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, */ if (user_mode(regs)) { local_irq_enable(); - error_code |= PF_USER; + error_code |= X86_PF_USER; flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1359,9 +1339,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (error_code & PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* @@ -1381,7 +1361,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if ((error_code & PF_USER) == 0 && + if (!(error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { bad_area_nosemaphore(regs, error_code, address, NULL); return; @@ -1408,7 +1388,7 @@ retry: bad_area(regs, error_code, address); return; } - if (error_code & PF_USER) { + if (error_code & X86_PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 836a1eb5df43..3ee234b6234d 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/syscalls.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <os.h> @@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm) mm->arch.ldt.entry_count = 0; } -int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { - return do_modify_ldt_skas(func, ptr, bytecount); + /* See non-um modify_ldt() for why we do this cast */ + return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 69b9deff7e5c..e7b213047724 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -600,7 +600,7 @@ static struct trap_array_entry trap_array[] = { #ifdef CONFIG_X86_MCE { machine_check, xen_machine_check, true }, #endif - { nmi, xen_nmi, true }, + { nmi, xen_xennmi, true }, { overflow, xen_overflow, false }, #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, @@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, } } -static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static void xen_load_sp0(unsigned long sp0) { struct multicall_space mcs; mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - tss->x86_tss.sp0 = thread->sp0; + this_cpu_write(cpu_tss.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 51471408fdd1..8c0e047d0b80 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -13,6 +13,7 @@ * single-threaded. */ #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/smp.h> @@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + /* + * Bring up the CPU in cpu_bringup_and_idle() with the stack + * pointing just below where pt_regs would be if it were a normal + * kernel entry. + */ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); xen_copy_trap_info(ctxt->trap_ctxt); @@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; + /* + * Set SS:SP that Xen will use when entering guest kernel mode + * from guest user mode. Subsequent calls to load_sp0() can + * change this value. + */ ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_sp = task_top_of_stack(idle); #ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; @@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) BUG(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index dae2cc33afb5..286ecc198562 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -29,7 +29,7 @@ xen_pv_trap debug xen_pv_trap xendebug xen_pv_trap int3 xen_pv_trap xenint3 -xen_pv_trap nmi +xen_pv_trap xennmi xen_pv_trap overflow xen_pv_trap bounds xen_pv_trap invalid_op diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index a7525e95d53f..124941d09b2b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -9,6 +9,7 @@ #include <asm/boot.h> #include <asm/asm.h> #include <asm/page_types.h> +#include <asm/unwind_hints.h> #include <xen/interface/elfnote.h> #include <xen/interface/features.h> @@ -19,6 +20,7 @@ #ifdef CONFIG_XEN_PV __INIT ENTRY(startup_xen) + UNWIND_HINT_EMPTY cld /* Clear .bss */ @@ -33,21 +35,24 @@ ENTRY(startup_xen) mov $init_thread_union+THREAD_SIZE, %_ASM_SP jmp xen_start_kernel - +END(startup_xen) __FINIT #endif .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE + .rept (PAGE_SIZE / 32) + UNWIND_HINT_EMPTY + .skip 32 + .endr #define HYPERCALL(n) \ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 #include <asm/xen-hypercalls.h> #undef HYPERCALL - +END(hypercall_page) .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 3ba24c428c3b..2fae850a3eff 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -214,7 +214,9 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); - if (err || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + if (err || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index f905f2361d12..8bae88a150fd 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -146,11 +146,8 @@ renesas_sdhi_internal_dmac_start_dma(struct tmio_mmc_host *host, WARN_ON(host->sg_len > 1); /* This DMAC cannot handle if buffer is not 8-bytes alignment */ - if (!IS_ALIGNED(sg->offset, 8)) { - host->force_pio = true; - renesas_sdhi_internal_dmac_enable_dma(host, false); - return; - } + if (!IS_ALIGNED(sg->offset, 8)) + goto force_pio; if (data->flags & MMC_DATA_READ) { dtran_mode |= DTRAN_MODE_CH_NUM_CH1; @@ -163,8 +160,8 @@ renesas_sdhi_internal_dmac_start_dma(struct tmio_mmc_host *host, } ret = dma_map_sg(&host->pdev->dev, sg, host->sg_len, dir); - if (ret < 0) - return; + if (ret == 0) + goto force_pio; renesas_sdhi_internal_dmac_enable_dma(host, true); @@ -176,6 +173,12 @@ renesas_sdhi_internal_dmac_start_dma(struct tmio_mmc_host *host, dtran_mode); renesas_sdhi_internal_dmac_dm_write(host, DM_DTRAN_ADDR, sg->dma_address); + + return; + +force_pio: + host->force_pio = true; + renesas_sdhi_internal_dmac_enable_dma(host, false); } static void renesas_sdhi_internal_dmac_issue_tasklet_fn(unsigned long arg) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index a7293e186e03..9c4e6199b854 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -47,6 +47,7 @@ #include <linux/mmc/sdio.h> #include <linux/scatterlist.h> #include <linux/spinlock.h> +#include <linux/swiotlb.h> #include <linux/workqueue.h> #include "tmio_mmc.h" @@ -1215,6 +1216,18 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host, mmc->max_blk_count = pdata->max_blk_count ? : (PAGE_SIZE / mmc->max_blk_size) * mmc->max_segs; mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; + /* + * Since swiotlb has memory size limitation, this will calculate + * the maximum size locally (because we don't have any APIs for it now) + * and check the current max_req_size. And then, this will update + * the max_req_size if needed as a workaround. + */ + if (swiotlb_max_segment()) { + unsigned int max_size = (1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; + + if (mmc->max_req_size > max_size) + mmc->max_req_size = max_size; + } mmc->max_seg_size = mmc->max_req_size; _host->native_hotplug = !(pdata->flags & TMIO_MMC_USE_GPIO_CD || diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8acfc1e099e1..63e56f6c1877 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -687,7 +687,7 @@ #define BUG_TABLE #endif -#ifdef CONFIG_ORC_UNWINDER +#ifdef CONFIG_UNWINDER_ORC #define ORC_UNWIND_TABLE \ . = ALIGN(4); \ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 6737a8c9e8c6..d68b0569a5eb 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -175,7 +175,8 @@ static inline s32 dev_pm_qos_requested_flags(struct device *dev) static inline s32 dev_pm_qos_raw_read_value(struct device *dev) { return IS_ERR_OR_NULL(dev->power.qos) ? - 0 : pm_qos_read_value(&dev->power.qos->resume_latency); + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : + pm_qos_read_value(&dev->power.qos->resume_latency); } #else static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dfdad67d8f6c..ff21b4dbb392 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -376,7 +376,7 @@ config STACK_VALIDATION that runtime stack traces are more reliable. This is also a prerequisite for generation of ORC unwind data, which - is needed for CONFIG_ORC_UNWINDER. + is needed for CONFIG_UNWINDER_ORC. For more information, see tools/objtool/Documentation/stack-validation.txt. diff --git a/lib/ioremap.c b/lib/ioremap.c index 4bb30206b942..c835f9080c43 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -161,6 +161,7 @@ int ioremap_page_range(unsigned long addr, unsigned long next; int err; + might_sleep(); BUG_ON(addr >= end); start = addr; diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 061d0c3a420a..f965f477832e 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1) __objtool_obj := $(objtree)/tools/objtool/objtool -objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) +objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check) ifndef CONFIG_FRAME_POINTER objtool_args += --no-fp diff --git a/tools/objtool/check.c b/tools/objtool/check.c index c0e26ad1fa7e..9b341584eb1b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1757,11 +1757,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, if (insn->dead_end) return 0; - insn = next_insn; - if (!insn) { + if (!next_insn) { + if (state.cfa.base == CFI_UNDEFINED) + return 0; WARN("%s: unexpected end of section", sec->name); return 1; } + + insn = next_insn; } return 0; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 31e0f9143840..07f329919828 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -70,7 +70,7 @@ static void cmd_usage(void) printf("\n"); - exit(1); + exit(129); } static void handle_options(int *argc, const char ***argv) @@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv) break; } else { fprintf(stderr, "Unknown option: %s\n", cmd); - fprintf(stderr, "\n Usage: %s\n", - objtool_usage_string); - exit(1); + cmd_usage(); } (*argv)++; |