diff options
author | Wolfram Sang <wsa@the-dreams.de> | 2018-01-04 00:50:51 +0300 |
---|---|---|
committer | Wolfram Sang <wsa@the-dreams.de> | 2018-01-04 00:50:51 +0300 |
commit | fddfa22a4403cd19548de075ddada0c7c966a232 (patch) | |
tree | f981a63ff0c1062aecb5ab48b0585ffe97a7c20b /arch/x86 | |
parent | 639136d2a70ab9837befb22ad5b3d67cb4db2216 (diff) | |
parent | 0f30aca72c3b68f4b6a443193b574f14106cd61e (diff) | |
download | linux-fddfa22a4403cd19548de075ddada0c7c966a232.tar.xz |
Merge tag 'at24-4.16-updates-for-wolfram' of git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux into i2c/for-4.16
"AT24 updates for 4.16 merge window
The driver has been converted to using regmap instead of raw i2c and
smbus calls which shrank the code significantly.
Device tree binding document has been cleaned up. Device tree support in
the driver has been improved and we now support all at24 models as well
as two new DT properties (no-read-rollover and wp-gpios).
We no longer user unreadable magic values for driver data as the way it
was implemented caused problems for some EEPROM models - we switched to
regular structs.
Aside from that, there's a bunch of coding style fixes and minor
improvements all over the place."
Diffstat (limited to 'arch/x86')
138 files changed, 3224 insertions, 976 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8eed3f94bfc7..d4fc98c50378 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -926,7 +926,8 @@ config MAXSMP config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP range 2 8 if SMP && X86_32 && !X86_BIGSMP - range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK + range 2 64 if SMP && X86_32 && X86_BIGSMP + range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 default "1" if !SMP default "8192" if MAXSMP diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 6293a8768a91..672441c008c7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT + depends on !STACKDEPOT ---help--- This option enables the "guess" unwinder for unwinding kernel stack traces. It scans the stack and reports every kernel text address it diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 1e9c322e973a..f25e1530e064 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -80,6 +80,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o vmlinux-objs-y += $(obj)/mem_encrypt.o + vmlinux-objs-y += $(obj)/pgtable_64.o endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 20919b4f3133..fc313e29fe2c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -305,10 +305,18 @@ ENTRY(startup_64) leaq boot_stack_end(%rbx), %rsp #ifdef CONFIG_X86_5LEVEL - /* Check if 5-level paging has already enabled */ - movq %cr4, %rax - testl $X86_CR4_LA57, %eax - jnz lvl5 + /* + * Check if we need to enable 5-level paging. + * RSI holds real mode data and need to be preserved across + * a function call. + */ + pushq %rsi + call l5_paging_required + popq %rsi + + /* If l5_paging_required() returned zero, we're done here. */ + cmpq $0, %rax + je lvl5 /* * At this point we are in long mode with 4-level paging enabled, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b50c42455e25..98761a1576ce 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -169,6 +169,16 @@ void __puthex(unsigned long value) } } +static bool l5_supported(void) +{ + /* Check if leaf 7 is supported. */ + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); +} + #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); + if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { + error("This linux kernel as configured requires 5-level paging\n" + "This CPU does not support the required 'cr4.la57' feature\n" + "Unable to boot - please use a kernel appropriate for your CPU\n"); + } + free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index d5364ca2e3f9..b5e5e02f8cde 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -23,6 +23,9 @@ */ #undef CONFIG_AMD_MEM_ENCRYPT +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c new file mode 100644 index 000000000000..b4469a37e9a1 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -0,0 +1,28 @@ +#include <asm/processor.h> + +/* + * __force_order is used by special_insns.h asm code to force instruction + * serialization. + * + * It is not referenced from the code, but GCC < 5 with -fPIE would fail + * due to an undefined symbol. Define it to make these ancient GCCs work. + */ +unsigned long __force_order; + +int l5_paging_required(void) +{ + /* Check if leaf 7 is supported. */ + + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return 0; + + /* Check if 5-level paging has already been enabled. */ + if (native_read_cr4() & X86_CR4_LA57) + return 0; + + return 1; +} diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 49f4970f693b..6a10d52a4145 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -44,9 +44,9 @@ FDINITRD=$6 # Make sure the files actually exist verify "$FBZIMAGE" -verify "$MTOOLSRC" genbzdisk() { + verify "$MTOOLSRC" mformat a: syslinux $FIMAGE echo "$KCMDLINE" | mcopy - a:syslinux.cfg @@ -57,6 +57,7 @@ genbzdisk() { } genfdimage144() { + verify "$MTOOLSRC" dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null mformat v: syslinux $FIMAGE @@ -68,6 +69,7 @@ genfdimage144() { } genfdimage288() { + verify "$MTOOLSRC" dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null mformat w: syslinux $FIMAGE @@ -78,39 +80,43 @@ genfdimage288() { mcopy $FBZIMAGE w:linux } -genisoimage() { +geniso() { tmp_dir=`dirname $FIMAGE`/isoimage rm -rf $tmp_dir mkdir $tmp_dir - for i in lib lib64 share end ; do + for i in lib lib64 share ; do for j in syslinux ISOLINUX ; do if [ -f /usr/$i/$j/isolinux.bin ] ; then isolinux=/usr/$i/$j/isolinux.bin - cp $isolinux $tmp_dir fi done for j in syslinux syslinux/modules/bios ; do if [ -f /usr/$i/$j/ldlinux.c32 ]; then ldlinux=/usr/$i/$j/ldlinux.c32 - cp $ldlinux $tmp_dir fi done if [ -n "$isolinux" -a -n "$ldlinux" ] ; then break fi - if [ $i = end -a -z "$isolinux" ] ; then - echo 'Need an isolinux.bin file, please install syslinux/isolinux.' - exit 1 - fi done + if [ -z "$isolinux" ] ; then + echo 'Need an isolinux.bin file, please install syslinux/isolinux.' + exit 1 + fi + if [ -z "$ldlinux" ] ; then + echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.' + exit 1 + fi + cp $isolinux $tmp_dir + cp $ldlinux $tmp_dir cp $FBZIMAGE $tmp_dir/linux echo "$KCMDLINE" > $tmp_dir/isolinux.cfg if [ -f "$FDINITRD" ] ; then cp "$FDINITRD" $tmp_dir/initrd.img fi - mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \ - -c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \ - $tmp_dir + genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \ + -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \ + -boot-info-table $tmp_dir isohybrid $FIMAGE 2>/dev/null || true rm -rf $tmp_dir } @@ -119,6 +125,6 @@ case $1 in bzdisk) genbzdisk;; fdimage144) genfdimage144;; fdimage288) genfdimage288;; - isoimage) genisoimage;; + isoimage) geniso;; *) echo 'Unknown image format'; exit 1; esac diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 399a29d067d6..cb91a64a99e7 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -59,13 +59,6 @@ static int encrypt(struct blkcipher_desc *desc, salsa20_ivsetup(ctx, walk.iv); - if (likely(walk.nbytes == nbytes)) - { - salsa20_encrypt_bytes(ctx, walk.src.virt.addr, - walk.dst.virt.addr, nbytes); - return blkcipher_walk_done(desc, &walk, 0); - } - while (walk.nbytes >= 64) { salsa20_encrypt_bytes(ctx, walk.src.virt.addr, walk.dst.virt.addr, diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3fd8bc560fae..45a63e00a6af 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,6 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/jump_label.h> #include <asm/unwind_hints.h> +#include <asm/cpufeatures.h> +#include <asm/page_types.h> +#include <asm/percpu.h> +#include <asm/asm-offsets.h> +#include <asm/processor-flags.h> /* @@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +/* + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * halves: + */ +#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT) +#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT)) + +.macro SET_NOFLUSH_BIT reg:req + bts $X86_CR3_PCID_NOFLUSH_BIT, \reg +.endm + +.macro ADJUST_KERNEL_CR3 reg:req + ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID + /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + andq $(~PTI_SWITCH_MASK), \reg +.endm + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + ADJUST_KERNEL_CR3 \scratch_reg + mov \scratch_reg, %cr3 +.Lend_\@: +.endm + +#define THIS_CPU_user_pcid_flush_mask \ + PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + + /* + * Test if the ASID needs a flush. + */ + movq \scratch_reg, \scratch_reg2 + andq $(0x7FF), \scratch_reg /* mask ASID */ + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask + jnc .Lnoflush_\@ + + /* Flush needed, clear the bit */ + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask + movq \scratch_reg2, \scratch_reg + jmp .Lwrcr3_\@ + +.Lnoflush_\@: + movq \scratch_reg2, \scratch_reg + SET_NOFLUSH_BIT \scratch_reg + +.Lwrcr3_\@: + /* Flip the PGD and ASID to the user version */ + orq $(PTI_SWITCH_MASK), \scratch_reg + mov \scratch_reg, %cr3 +.Lend_\@: +.endm + +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req + pushq %rax + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax + popq %rax +.endm + +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI + movq %cr3, \scratch_reg + movq \scratch_reg, \save_reg + /* + * Is the "switch mask" all zero? That means that both of + * these are zero: + * + * 1. The user/kernel PCID bit, and + * 2. The user/kernel "bit" that points CR3 to the + * bottom half of the 8k PGD + * + * That indicates a kernel CR3 value, not a user CR3. + */ + testq $(PTI_SWITCH_MASK), \scratch_reg + jz .Ldone_\@ + + ADJUST_KERNEL_CR3 \scratch_reg + movq \scratch_reg, %cr3 + +.Ldone_\@: +.endm + +.macro RESTORE_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + + /* + * KERNEL pages can always resume with NOFLUSH as we do + * explicit flushes. + */ + bt $X86_CR3_PTI_SWITCH_BIT, \save_reg + jnc .Lnoflush_\@ + + /* + * Check if there's a pending flush for the user ASID we're + * about to set. + */ + movq \save_reg, \scratch_reg + andq $(0x7FF), \scratch_reg + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask + jnc .Lnoflush_\@ + + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask + jmp .Lwrcr3_\@ + +.Lnoflush_\@: + SET_NOFLUSH_BIT \save_reg + +.Lwrcr3_\@: + /* + * The CR3 write could be avoided when not changing its value, + * but would require a CR3 read *and* a scratch register. + */ + movq \save_reg, %cr3 +.Lend_\@: +.endm + +#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req +.endm +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req +.endm +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req +.endm +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req +.endm +.macro RESTORE_CR3 scratch_reg:req save_reg:req +.endm + +#endif + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4838037f97f6..ace8f321a5a1 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -941,9 +941,10 @@ ENTRY(debug) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Ldebug_from_sysenter_stack TRACE_IRQS_OFF @@ -984,9 +985,10 @@ ENTRY(nmi) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Lnmi_from_sysenter_stack /* Not on SYSENTER stack. */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f81d50d7ceac..f048e384ff54 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -23,7 +23,6 @@ #include <asm/segment.h> #include <asm/cache.h> #include <asm/errno.h> -#include "calling.h" #include <asm/asm-offsets.h> #include <asm/msr.h> #include <asm/unistd.h> @@ -40,6 +39,8 @@ #include <asm/frame.h> #include <linux/err.h> +#include "calling.h" + .code64 .section .entry.text, "ax" @@ -140,6 +141,67 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ + .pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline. This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address). So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * _entry_trampoline(%rip) refers to the start of the remapped) entry + * trampoline. We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ + SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY + swapgs + + /* Stash the user RSP. */ + movq %rsp, RSP_SCRATCH + + /* Note: using %rsp as a scratch reg. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Load the top of the task stack into RSP */ + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + + /* Start building the simulated IRET frame. */ + pushq $__USER_DS /* pt_regs->ss */ + pushq RSP_SCRATCH /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + + /* + * x86 lacks a near absolute jump, and we can't jump to the real + * entry text with a relative jump. We could push the target + * address and then use retq, but this destroys the pipeline on + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, + * spill RDI and restore it in a second-stage trampoline. + */ + pushq %rdi + movq $entry_SYSCALL_64_stage2, %rdi + jmp *%rdi +END(entry_SYSCALL_64_trampoline) + + .popsection + +ENTRY(entry_SYSCALL_64_stage2) + UNWIND_HINT_EMPTY + popq %rdi + jmp entry_SYSCALL_64_after_hwframe +END(entry_SYSCALL_64_stage2) + ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* @@ -149,6 +211,10 @@ ENTRY(entry_SYSCALL_64) */ swapgs + /* + * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it + * is not required to switch CR3. + */ movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -330,8 +396,25 @@ syscall_return_via_sysret: popq %rsi /* skip rcx */ popq %rdx popq %rsi + + /* + * Now all regs are restored except RSP and RDI. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + popq %rdi - movq RSP-ORIG_RAX(%rsp), %rsp + popq %rsp USERGS_SYSRET64 END(entry_SYSCALL_64) @@ -466,12 +549,13 @@ END(irq_entries_start) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY - pushfq - testl $X86_EFLAGS_IF, (%rsp) + pushq %rax + SAVE_FLAGS(CLBR_RAX) + testl $X86_EFLAGS_IF, %eax jz .Lokay_\@ ud2 .Lokay_\@: - addq $8, %rsp + popq %rax #endif .endm @@ -563,6 +647,13 @@ END(irq_entries_start) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld + + testb $3, CS-ORIG_RAX(%rsp) + jz 1f + SWAPGS + call switch_to_thread_stack +1: + ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS @@ -572,12 +663,8 @@ END(irq_entries_start) jz 1f /* - * IRQ from user mode. Switch to kernel gsbase and inform context - * tracking that we're in kernel mode. - */ - SWAPGS - - /* + * IRQ from user mode. + * * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode * (which can take locks). Since TRACE_IRQS_OFF idempotent, @@ -630,10 +717,43 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) ud2 1: #endif - SWAPGS POP_EXTRA_REGS - POP_C_REGS - addq $8, %rsp /* skip regs->orig_ax */ + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + + /* + * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ + pushq 5*8(%rdi) /* RSP */ + pushq 4*8(%rdi) /* EFLAGS */ + pushq 3*8(%rdi) /* CS */ + pushq 2*8(%rdi) /* RIP */ + + /* Push user RDI on the trampoline stack. */ + pushq (%rdi) + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + /* Restore RDI. */ + popq %rdi + SWAPGS INTERRUPT_RETURN @@ -713,7 +833,9 @@ native_irq_return_ldt: */ pushq %rdi /* Stash user RDI */ - SWAPGS + SWAPGS /* to kernel GS */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ + movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ movq (1*8)(%rsp), %rax /* user RIP */ @@ -729,7 +851,6 @@ native_irq_return_ldt: /* Now RAX == RSP. */ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ - popq %rdi /* Restore user RDI */ /* * espfix_stack[31:16] == 0. The page tables are set up such that @@ -740,7 +861,11 @@ native_irq_return_ldt: * still points to an RO alias of the ESPFIX stack. */ orq PER_CPU_VAR(espfix_stack), %rax - SWAPGS + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + SWAPGS /* to user GS */ + popq %rdi /* Restore user RDI */ + movq %rax, %rsp UNWIND_HINT_IRET_REGS offset=8 @@ -829,7 +954,35 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + +/* + * Switch to the thread stack. This is called with the IRET frame and + * orig_ax on the stack. (That is, RDI..R12 are not on the stack and + * space has not been allocated for them.) + */ +ENTRY(switch_to_thread_stack) + UNWIND_HINT_FUNC + + pushq %rdi + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI + + pushq 7*8(%rdi) /* regs->ss */ + pushq 6*8(%rdi) /* regs->rsp */ + pushq 5*8(%rdi) /* regs->eflags */ + pushq 4*8(%rdi) /* regs->cs */ + pushq 3*8(%rdi) /* regs->ip */ + pushq 2*8(%rdi) /* regs->orig_ax */ + pushq 8(%rdi) /* return address */ + UNWIND_HINT_FUNC + + movq (%rdi), %rdi + ret +END(switch_to_thread_stack) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) @@ -848,11 +1001,12 @@ ENTRY(\sym) ALLOC_PT_GPREGS_ON_STACK - .if \paranoid - .if \paranoid == 1 + .if \paranoid < 2 testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ - jnz 1f + jnz .Lfrom_usermode_switch_stack_\@ .endif + + .if \paranoid call paranoid_entry .else call error_entry @@ -894,20 +1048,15 @@ ENTRY(\sym) jmp error_exit .endif - .if \paranoid == 1 + .if \paranoid < 2 /* - * Paranoid entry from userspace. Switch stacks and treat it + * Entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers * run in real process context if user_mode(regs). */ -1: +.Lfrom_usermode_switch_stack_\@: call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - call sync_regs - movq %rax, %rsp /* switch stack */ - movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code @@ -1119,7 +1268,11 @@ ENTRY(paranoid_entry) js 1f /* negative -> in kernel */ SWAPGS xorl %ebx, %ebx -1: ret + +1: + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + + ret END(paranoid_entry) /* @@ -1141,6 +1294,7 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: @@ -1168,8 +1322,18 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ + ENCODE_FRAME_POINTER + pushq %r12 + /* * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode @@ -1206,6 +1370,7 @@ ENTRY(error_entry) * .Lgs_change's error handler with kernel gsbase. */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done .Lbstep_iret: @@ -1215,10 +1380,11 @@ ENTRY(error_entry) .Lerror_bad_iret: /* - * We came from an IRET to user mode, so we have user gsbase. - * Switch to kernel gsbase: + * We came from an IRET to user mode, so we have user + * gsbase and CR3. Switch to kernel gsbase and CR3: */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* * Pretend that the exception came from user mode: set up pt_regs @@ -1250,6 +1416,10 @@ END(error_exit) /* * Runs on exception stack. Xen PV does not go through this path at all, * so we can use real assembly here. + * + * Registers: + * %r14: Used to save/restore the CR3 of the interrupted context + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ ENTRY(nmi) UNWIND_HINT_IRET_REGS @@ -1313,6 +1483,7 @@ ENTRY(nmi) swapgs cld + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 @@ -1565,6 +1736,8 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 568e130d932c..40f17009ec20 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -48,7 +48,11 @@ */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + SWAPGS + + /* We are about to clobber %rsp anyway, clobbering here is OK */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* @@ -216,6 +220,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) pushq $0 /* pt_regs->r15 = 0 */ /* + * We just saved %rdi so it is safe to clobber. It is not + * preserved during the C calls inside TRACE_IRQS_OFF anyway. + */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. */ @@ -256,10 +266,22 @@ sysret32_from_system_call: * when the system call started, which is already known to user * code. We zero R8-R10 to avoid info leaks. */ + movq RSP-ORIG_RAX(%rsp), %rsp + + /* + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored + * on the process stack which is not mapped to userspace and + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 + * switch until after after the last reference to the process + * stack. + * + * %r8/%r9 are zeroed before the sysret, thus safe to clobber. + */ + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 + xorq %r8, %r8 xorq %r9, %r9 xorq %r10, %r10 - movq RSP-ORIG_RAX(%rsp), %rsp swapgs sysretl END(entry_SYSCALL_compat) @@ -306,8 +328,11 @@ ENTRY(entry_INT80_compat) */ movl %eax, %eax - /* Construct struct pt_regs on stack (iret frame is already on stack) */ pushq %rax /* pt_regs->orig_ax */ + + /* switch to thread stack expects orig_ax to be pushed */ + call switch_to_thread_stack + pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 11b13c4b43d5..f19856d95c60 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -324,5 +324,5 @@ notrace time_t __vdso_time(time_t *t) *t = result; return result; } -int time(time_t *t) +time_t time(time_t *t) __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index f279ba2643dc..577fa8adb785 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -37,6 +37,7 @@ #include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/traps.h> +#include <asm/paravirt.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" @@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) WARN_ON_ONCE(address != regs->ip); + /* This should be unreachable in NATIVE mode. */ + if (WARN_ON(vsyscall_mode == NATIVE)) + return false; + if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); @@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr) return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } +/* + * The VSYSCALL page is the only user-accessible page in the kernel address + * range. Normally, the kernel page tables can have _PAGE_USER clear, but + * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls + * are enabled. + * + * Some day we may create a "minimal" vsyscall mode in which we emulate + * vsyscalls but leave the page not present. If so, we skip calling + * this. + */ +void __init set_vsyscall_pgtable_user_bits(pgd_t *root) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); + p4d = p4d_offset(pgd, VSYSCALL_ADDR); +#if CONFIG_PGTABLE_LEVELS >= 5 + p4d->p4d |= _PAGE_USER; +#endif + pud = pud_offset(p4d, VSYSCALL_ADDR); + set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); + pmd = pmd_offset(pud, VSYSCALL_ADDR); + set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); +} + void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - if (vsyscall_mode != NONE) + if (vsyscall_mode != NONE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, vsyscall_mode == NATIVE ? PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); + set_vsyscall_pgtable_user_bits(swapper_pg_dir); + } BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 09c26a4f139c..731153a4681e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = { __init int intel_pmu_init(void) { + struct attribute **extra_attr = NULL; + struct attribute **to_free = NULL; union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; @@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; - struct attribute **extra_attr = NULL; char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void) extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; extra_attr = merge_attr(extra_attr, skl_format_attr); + to_free = extra_attr; x86_pmu.cpu_events = get_hsw_events_attrs(); intel_pmu_pebs_data_source_skl( boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); @@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } + kfree(to_free); return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3674a4b6f8bd..8f0aace08b87 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -3,16 +3,18 @@ #include <linux/types.h> #include <linux/slab.h> +#include <asm/cpu_entry_area.h> #include <asm/perf_event.h> #include <asm/insn.h> #include "../perf_event.h" +/* Waste a full page so it can be mapped into the cpu_entry_area */ +DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_FIXUP_SIZE PAGE_SIZE /* @@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu) static DEFINE_PER_CPU(void *, insn_buffer); -static int alloc_pebs_buffer(int cpu) +static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + phys_addr_t pa; + size_t msz = 0; + + pa = virt_to_phys(addr); + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, pa, prot); +} + +static void ds_clear_cea(void *cea, size_t size) +{ + size_t msz = 0; + + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +} + +static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) +{ + unsigned int order = get_order(size); int node = cpu_to_node(cpu); - int max; - void *buffer, *ibuffer; + struct page *page; + + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); + return page ? page_address(page) : NULL; +} + +static void dsfree_pages(const void *buffer, size_t size) +{ + if (buffer) + free_pages((unsigned long)buffer, get_order(size)); +} + +static int alloc_pebs_buffer(int cpu) +{ + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + size_t bsiz = x86_pmu.pebs_buffer_size; + int max, node = cpu_to_node(cpu); + void *buffer, *ibuffer, *cea; if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; @@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu) if (x86_pmu.intel_cap.pebs_format < 2) { ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); if (!ibuffer) { - kfree(buffer); + dsfree_pages(buffer, bsiz); return -ENOMEM; } per_cpu(insn_buffer, cpu) = ibuffer; } - - max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_pebs_vaddr = buffer; + /* Update the cpu entry area mapping */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds->pebs_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL); ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); + ds->pebs_absolute_maximum = ds->pebs_buffer_base + max; return 0; } static void release_pebs_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.pebs) return; @@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu) kfree(per_cpu(insn_buffer, cpu)); per_cpu(insn_buffer, cpu) = NULL; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); ds->pebs_buffer_base = 0; + dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->ds_pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max, thresh; - void *buffer; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *buffer, *cea; + int max; if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu); if (unlikely(!buffer)) { WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; } - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_bts_vaddr = buffer; + /* Update the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds->bts_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - + max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); + ds->bts_absolute_maximum = ds->bts_buffer_base + max; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); return 0; } static void release_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds_clear_cea(cea, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; + dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); + hwev->ds_bts_vaddr = NULL; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; - return 0; } static void release_ds_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f7aaadf9331f..8e4ea143ed96 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,8 @@ #include <linux/perf_event.h> +#include <asm/intel_ds.h> + /* To enable MSR tracing please use the generic trace points. */ /* @@ -77,8 +79,6 @@ struct amd_nb { struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) /* @@ -95,23 +95,6 @@ struct amd_nb { PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - #define PEBS_REGS \ (PERF_REG_X86_AX | \ PERF_REG_X86_BX | \ @@ -216,6 +199,8 @@ struct cpu_hw_events { * Intel DebugStore bits */ struct debug_store *ds; + void *ds_pebs_vaddr; + void *ds_bts_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..386a6900e206 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -136,6 +136,7 @@ #endif #ifndef __ASSEMBLY__ +#ifndef __BPF__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -145,5 +146,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif +#endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h new file mode 100644 index 000000000000..4a7884b8dca5 --- /dev/null +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ASM_X86_CPU_ENTRY_AREA_H +#define _ASM_X86_CPU_ENTRY_AREA_H + +#include <linux/percpu-defs.h> +#include <asm/processor.h> +#include <asm/intel_ds.h> + +/* + * cpu_entry_area is a percpu region that contains things needed by the CPU + * and early entry/exit code. Real types aren't used for all fields here + * to avoid circular header dependencies. + * + * Every field is a virtual alias of some other allocated backing store. + * There is no direct allocation of a struct cpu_entry_area. + */ +struct cpu_entry_area { + char gdt[PAGE_SIZE]; + + /* + * The GDT is just below entry_stack and thus serves (on x86_64) as + * a a read-only guard page. + */ + struct entry_stack_page entry_stack_page; + + /* + * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because + * we need task switches to work, and task switches write to the TSS. + */ + struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; + +#ifdef CONFIG_X86_64 + /* + * Exception stacks used for IST entries. + * + * In the future, this should have a separate slot for each stack + * with guard pages between them. + */ + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +#endif +#ifdef CONFIG_CPU_SUP_INTEL + /* + * Per CPU debug store for Intel performance monitoring. Wastes a + * full page at the moment. + */ + struct debug_store cpu_debug_store; + /* + * The actual PEBS/BTS buffers must be mapped to user space + * Reserve enough fixmap PTEs. + */ + struct debug_store_buffers cpu_debug_buffers; +#endif +}; + +#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) + +DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + +extern void setup_cpu_entry_areas(void); +extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); + +#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE +#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) + +#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) + +#define CPU_ENTRY_AREA_MAP_SIZE \ + (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + +extern struct cpu_entry_area *get_cpu_entry_area(int cpu); + +static inline struct entry_stack *cpu_entry_stack(int cpu) +{ + return &get_cpu_entry_area(cpu)->entry_stack_page.stack; +} + +#endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bf6a76202a77..ea9a7dde62e5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66..07cdd1715705 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -197,11 +197,12 @@ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ - +#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ @@ -266,6 +267,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -339,5 +341,6 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4011cb03ef08..13c5ee878a47 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -7,6 +7,7 @@ #include <asm/mmu.h> #include <asm/fixmap.h> #include <asm/irq_vectors.h> +#include <asm/cpu_entry_area.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -20,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->type = (info->read_exec_only ^ 1) << 1; desc->type |= info->contents << 2; + /* Set the ACCESS bit so it can be mapped RO */ + desc->type |= 1; desc->s = 1; desc->dpl = 0x3; @@ -60,17 +63,10 @@ static inline struct desc_struct *get_current_gdt_rw(void) return this_cpu_ptr(&gdt_page)->gdt; } -/* Get the fixmap index for a specific processor */ -static inline unsigned int get_cpu_gdt_ro_index(int cpu) -{ - return FIX_GDT_REMAP_BEGIN + cpu; -} - /* Provide the fixmap address of the remapped GDT */ static inline struct desc_struct *get_cpu_gdt_ro(int cpu) { - unsigned int idx = get_cpu_gdt_ro_index(cpu); - return (struct desc_struct *)__fix_to_virt(idx); + return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt; } /* Provide the current read-only GDT */ @@ -185,7 +181,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, #endif } -static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr) { struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 14d6d5007314..b027633e7300 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -50,6 +50,12 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define DISABLE_PTI 0 +#else +# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -60,7 +66,7 @@ #define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 -#define DISABLED_MASK7 0 +#define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 #define DISABLED_MASK9 (DISABLE_MPX) #define DISABLED_MASK10 0 diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 0211029076ea..6777480d8a42 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_ESPFIX_H #define _ASM_X86_ESPFIX_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_ESPFIX64 #include <asm/percpu.h> @@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); extern void init_espfix_bsp(void); extern void init_espfix_ap(int cpu); - -#endif /* CONFIG_X86_64 */ +#else +static inline void init_espfix_ap(int cpu) { } +#endif #endif /* _ASM_X86_ESPFIX_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b0c505fe9a95..64c4a30e0d39 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -44,7 +44,6 @@ extern unsigned long __FIXADDR_TOP; PAGE_SIZE) #endif - /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at @@ -84,7 +83,6 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif - FIX_RO_IDT, /* Virtual mapping for read-only IDT */ #ifdef CONFIG_X86_32 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, @@ -100,9 +98,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif - /* Fixmap entries to remap the GDTs, one per processor. */ - FIX_GDT_REMAP_BEGIN, - FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, #ifdef CONFIG_ACPI_APEI_GHES /* Used for GHES mapping from assorted contexts */ @@ -143,7 +138,7 @@ enum fixed_addresses { extern void reserve_top_address(unsigned long reserve); #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) extern int fixmaps_set; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 1b0a5abcd8ae..96aa6b9884dc 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,16 +20,7 @@ #ifndef _ASM_X86_HYPERVISOR_H #define _ASM_X86_HYPERVISOR_H -#ifdef CONFIG_HYPERVISOR_GUEST - -#include <asm/kvm_para.h> -#include <asm/x86_init.h> -#include <asm/xen/hypervisor.h> - -/* - * x86 hypervisor information - */ - +/* x86 hypervisor types */ enum x86_hypervisor_type { X86_HYPER_NATIVE = 0, X86_HYPER_VMWARE, @@ -39,6 +30,12 @@ enum x86_hypervisor_type { X86_HYPER_KVM, }; +#ifdef CONFIG_HYPERVISOR_GUEST + +#include <asm/kvm_para.h> +#include <asm/x86_init.h> +#include <asm/xen/hypervisor.h> + struct hypervisor_x86 { /* Hypervisor name */ const char *name; @@ -58,7 +55,15 @@ struct hypervisor_x86 { extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ + return x86_hyper_type == type; +} #else static inline void init_hypervisor_platform(void) { } +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ + return type == X86_HYPER_NATIVE; +} #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h new file mode 100644 index 000000000000..62a9f4966b42 --- /dev/null +++ b/arch/x86/include/asm/intel_ds.h @@ -0,0 +1,36 @@ +#ifndef _ASM_INTEL_DS_H +#define _ASM_INTEL_DS_H + +#include <linux/percpu-defs.h> + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +} __aligned(PAGE_SIZE); + +DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + +struct debug_store_buffers { + char bts_buffer[BTS_BUFFER_SIZE]; + char pebs_buffer[PEBS_BUFFER_SIZE]; +}; + +#endif diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h new file mode 100644 index 000000000000..989cfa86de85 --- /dev/null +++ b/arch/x86/include/asm/invpcid.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INVPCID +#define _ASM_X86_INVPCID + +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + struct { u64 d[2]; } desc = { { pcid, addr } }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + +#endif /* _ASM_X86_INVPCID */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 139feef467f7..c066ffae222b 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); extern int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early); + struct irq_data *irq_data, bool reserve); extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c8ef23f2c28f..89f08955fff7 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void) swapgs; \ sysretl +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(x) pushfq; popq %rax +#endif #else #define INTERRUPT_RETURN iret #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index f86a8caa561e..395c9631e000 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); +extern void show_iret_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/include/asm/kmemcheck.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 034caa1a084e..b24b1c8b3979 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -214,8 +214,6 @@ struct x86_emulate_ops { void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); - void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ - void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 977de5fb968b..516798431328 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -536,7 +536,20 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; + /* + * QEMU userspace and the guest each have their own FPU state. + * In vcpu_run, we switch between the user and guest FPU contexts. + * While running a VCPU, the VCPU thread will have the guest FPU + * context. + * + * Note that while the PKRU state lives inside the fpu registers, + * it is switched out separately at VMENTER and VMEXIT time. The + * "guest_fpu" state here contains the guest FPU context, with the + * host PRKU bits. + */ + struct fpu user_fpu; struct fpu guest_fpu; + u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -1435,4 +1448,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 9ea26f167497..5ff3e8af2c20 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,6 +3,7 @@ #define _ASM_X86_MMU_H #include <linux/spinlock.h> +#include <linux/rwsem.h> #include <linux/mutex.h> #include <linux/atomic.h> @@ -27,7 +28,8 @@ typedef struct { atomic64_t tlb_gen; #ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; + struct rw_semaphore ldt_usr_sem; + struct ldt_struct *ldt; #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6d16d15d09a0..c931b88982a0 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -50,22 +50,53 @@ struct ldt_struct { * call gates. On native, we could merge the ldt_struct and LDT * allocations, but it's not worth trying to optimize. */ - struct desc_struct *entries; - unsigned int nr_entries; + struct desc_struct *entries; + unsigned int nr_entries; + + /* + * If PTI is in use, then the entries array is not mapped while we're + * in user mode. The whole array will be aliased at the addressed + * given by ldt_slot_va(slot). We use two slots so that we can allocate + * and map, and enable a new LDT without invalidating the mapping + * of an older, still-in-use LDT. + * + * slot will be -1 if this LDT doesn't have an alias mapping. + */ + int slot; }; +/* This is a multiple of PAGE_SIZE. */ +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) + +static inline void *ldt_slot_va(int slot) +{ +#ifdef CONFIG_X86_64 + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); +#else + BUG(); +#endif +} + /* * Used for LDT copy/destruction. */ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); +static inline void init_new_context_ldt(struct mm_struct *mm) +{ + mm->context.ldt = NULL; + init_rwsem(&mm->context.ldt_usr_sem); +} +int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); +void ldt_arch_exit_mmap(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ -static inline int init_new_context_ldt(struct task_struct *tsk, - struct mm_struct *mm) +static inline void init_new_context_ldt(struct mm_struct *mm) { } +static inline int ldt_dup_context(struct mm_struct *oldmm, + struct mm_struct *mm) { return 0; } -static inline void destroy_context_ldt(struct mm_struct *mm) {} +static inline void destroy_context_ldt(struct mm_struct *mm) { } +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } #endif static inline void load_mm_ldt(struct mm_struct *mm) @@ -90,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm) * that we can see. */ - if (unlikely(ldt)) - set_ldt(ldt->entries, ldt->nr_entries); - else + if (unlikely(ldt)) { + if (static_cpu_has(X86_FEATURE_PTI)) { + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { + /* + * Whoops -- either the new LDT isn't mapped + * (if slot == -1) or is mapped into a bogus + * slot (if slot > 1). + */ + clear_LDT(); + return; + } + + /* + * If page table isolation is enabled, ldt->entries + * will not be mapped in the userspace pagetables. + * Tell the CPU to access the LDT through the alias + * at ldt_slot_va(ldt->slot). + */ + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); + } else { + set_ldt(ldt->entries, ldt->nr_entries); + } + } else { clear_LDT(); + } #else clear_LDT(); #endif @@ -132,18 +184,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mutex_init(&mm->context.lock); + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; } - #endif - return init_new_context_ldt(tsk, mm); +#endif + init_new_context_ldt(mm); + return 0; } static inline void destroy_context(struct mm_struct *mm) { @@ -176,15 +231,16 @@ do { \ } while (0) #endif -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { paravirt_arch_dup_mmap(oldmm, mm); + return ldt_dup_context(oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); + ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 @@ -282,33 +338,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, } /* - * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID - * bits. This serves two purposes. It prevents a nasty situation in - * which PCID-unaware code saves CR3, loads some other value (with PCID - * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if - * the saved ASID was nonzero. It also means that any bugs involving - * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger - * deterministically. - */ - -static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) -{ - if (static_cpu_has(X86_FEATURE_PCID)) { - VM_WARN_ON_ONCE(asid > 4094); - return __sme_pa(mm->pgd) | (asid + 1); - } else { - VM_WARN_ON_ONCE(asid != 0); - return __sme_pa(mm->pgd); - } -} - -static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) -{ - VM_WARN_ON_ONCE(asid > 4094); - return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; -} - -/* * This can be used from process context to figure out what the value of * CR3 is without needing to do a (slow) __read_cr3(). * @@ -317,7 +346,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) */ static inline unsigned long __get_current_cr3_fast(void) { - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, this_cpu_read(cpu_tlbstate.loaded_mm_asid)); /* For now, be very restrictive about when this can be called. */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 283efcaac8af..892df375b615 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -927,6 +927,15 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ + PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ + PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +#endif + #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 4b5e1eafada7..aff42e1da6ee 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Instead of one PGD, we acquire two PGDs. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 +#else +#define PGD_ALLOCATION_ORDER 0 +#endif + /* * Allocate and free page tables. */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 95e2dfd75521..e42b8943cb1a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); void ptdump_walk_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX @@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) static inline int p4d_bad(p4d_t p4d) { - return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; + unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (p4d_flags(p4d) & ~ignore_flags) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + unsigned long ignore_flags = _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd) * pgd_offset() returns a (pgd_t *) * pgd_index() is used get the offset into the pgd page's array of pgd_t's; */ -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) +/* + * a shortcut to get a pgd_t in a given mm + */ +#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's @@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud) */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + /* Clone the user space pgd as well */ + memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), + count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index f2ca9b28fd68..ce245b0cdfca 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #define LAST_PKMAP 1024 #endif -#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ - & PMD_MASK) +/* + * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c + * to avoid include recursion hell + */ +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) + +#define CPU_ENTRY_AREA_BASE \ + ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) + +#define PKMAP_BASE \ + ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) #else -# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) +# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) #endif #define MODULES_VADDR VMALLOC_START diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e9f05331e732..81462e9a34f6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). + */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr |= BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr &= ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. + */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgd(pgdp, pgd); +} +#else +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif + static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) + p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); +#else *p4dp = p4d; +#endif } static inline void native_p4d_clear(p4d_t *p4d) @@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + *pgdp = pti_set_user_pgd(pgdp, pgd); +#else *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6d5f45dcd4a1..b97a539bcdee 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -76,32 +76,45 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) + #ifdef CONFIG_X86_5LEVEL -#define VMALLOC_SIZE_TB _AC(16384, UL) -#define __VMALLOC_BASE _AC(0xff92000000000000, UL) -#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +# define VMALLOC_SIZE_TB _AC(12800, UL) +# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) +# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +# define LDT_PGD_ENTRY _AC(-112, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #else -#define VMALLOC_SIZE_TB _AC(32, UL) -#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +# define VMALLOC_SIZE_TB _AC(32, UL) +# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) +# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +# define LDT_PGD_ENTRY _AC(-4, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif + #ifdef CONFIG_RANDOMIZE_MEMORY -#define VMALLOC_START vmalloc_base -#define VMEMMAP_START vmemmap_base +# define VMALLOC_START vmalloc_base +# define VMEMMAP_START vmemmap_base #else -#define VMALLOC_START __VMALLOC_BASE -#define VMEMMAP_START __VMEMMAP_BASE +# define VMALLOC_START __VMALLOC_BASE +# define VMEMMAP_START __VMEMMAP_BASE #endif /* CONFIG_RANDOMIZE_MEMORY */ -#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) -#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + +#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) + +#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) -#define MODULES_LEN (MODULES_END - MODULES_VADDR) -#define ESPFIX_PGD_ENTRY _AC(-2, UL) -#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) -#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) +#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) + +#define CPU_ENTRY_AREA_PGD _AC(-3, UL) +#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) + +#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 43212a43ee69..6a60fea90b9d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -38,6 +38,11 @@ #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_SWITCH_BIT 11 +#endif + #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cc16fa882e3e..d3a67fba200a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -163,9 +163,9 @@ enum cpuid_regs_idx { extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; -extern struct tss_struct doublefault_tss; -extern __u32 cpu_caps_cleared[NCAPINTS]; -extern __u32 cpu_caps_set[NCAPINTS]; +extern struct x86_hw_tss doublefault_tss; +extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); @@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir) write_cr3(__sme_pa(pgdir)); } +/* + * Note that while the legacy 'TSS' name comes from 'Task State Segment', + * on modern x86 CPUs the TSS also holds information important to 64-bit mode, + * unrelated to the task-switch mechanism: + */ #ifdef CONFIG_X86_32 /* This is the TSS defined by the hardware. */ struct x86_hw_tss { @@ -305,7 +310,13 @@ struct x86_hw_tss { struct x86_hw_tss { u32 reserved1; u64 sp0; + + /* + * We store cpu_current_top_of_stack in sp1 so it's always accessible. + * Linux does not use ring 1, so sp1 is not otherwise needed. + */ u64 sp1; + u64 sp2; u64 reserved2; u64 ist[7]; @@ -323,12 +334,22 @@ struct x86_hw_tss { #define IO_BITMAP_BITS 65536 #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 +struct entry_stack { + unsigned long words[64]; +}; + +struct entry_stack_page { + struct entry_stack stack; +} __aligned(PAGE_SIZE); + struct tss_struct { /* - * The hardware state: + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering + * errata. */ struct x86_hw_tss x86_tss; @@ -339,18 +360,9 @@ struct tss_struct { * be within the limit. */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +} __aligned(PAGE_SIZE); -#ifdef CONFIG_X86_32 - /* - * Space for the temporary SYSENTER stack. - */ - unsigned long SYSENTER_stack_canary; - unsigned long SYSENTER_stack[64]; -#endif - -} ____cacheline_aligned; - -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); /* * sizeof(unsigned long) coming from an extra "long" at the end @@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#else +/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ +#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif /* @@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask) static inline void native_load_sp0(unsigned long sp0) { - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } static inline void native_swapgs(void) @@ -535,12 +550,12 @@ static inline void native_swapgs(void) static inline unsigned long current_top_of_stack(void) { -#ifdef CONFIG_X86_64 - return this_cpu_read_stable(cpu_tss.x86_tss.sp0); -#else - /* sp0 on x86_32 is special in and around vm86 mode. */ + /* + * We can't read directly from tss.sp0: sp0 on x86_32 is special in + * and around vm86 mode and sp0 on x86_64 is special because of the + * entry trampoline. + */ return this_cpu_read_stable(cpu_current_top_of_stack); -#endif } static inline bool on_thread_stack(void) @@ -837,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x) #else /* - * User space process size. 47bits minus one guard page. The guard - * page is necessary on Intel CPUs: if a SYSCALL instruction is at - * the highest possible canonical userspace address, then that - * syscall will enter the kernel with a non-canonical return - * address, and SYSRET will explode dangerously. We avoid this - * particular problem by preventing anything from being mapped - * at the maximum canonical address. + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] */ #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h new file mode 100644 index 000000000000..0b5ef05b2d2d --- /dev/null +++ b/arch/x86/include/asm/pti.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _ASM_X86_PTI_H +#define _ASM_X86_PTI_H +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern void pti_init(void); +extern void pti_check_boottime_disable(void); +#else +static inline void pti_check_boottime_disable(void) { } +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PTI_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index b20f9d623f9c..8f09012b92e7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -236,11 +236,23 @@ */ #define EARLY_IDT_HANDLER_SIZE 9 +/* + * xen_early_idt_handler_array is for Xen pv guests: for each entry in + * early_idt_handler_array it contains a prequel in the form of + * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to + * max 8 bytes. + */ +#define XEN_EARLY_IDT_HANDLER_SIZE 8 + #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; +#endif + /* * Load a segment. Fall back on loading the zero segment if something goes * wrong. This variant assumes that loading zero fully clears the segment. diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 8da111b3c342..f73706878772 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -16,6 +16,7 @@ enum stack_type { STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, + STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; @@ -28,6 +29,8 @@ struct stack_info { bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); +bool in_entry_stack(unsigned long *stack, struct stack_info *info); + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 982c325dad33..8be6afb58471 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,7 +12,13 @@ /* image of the saved processor state */ struct saved_context { - u16 es, fs, gs, ss; + /* + * On x86_32, all segment registers, with the possible exception of + * gs, are saved at kernel entry in pt_regs. + */ +#ifdef CONFIG_X86_32_LAZY_GS + u16 gs; +#endif unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7306e911faee..a7af9f53c0cb 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -20,8 +20,20 @@ */ struct saved_context { struct pt_regs regs; - u16 ds, es, fs, gs, ss; - unsigned long gs_base, gs_kernel_base, fs_base; + + /* + * User CS and SS are saved in current_pt_regs(). The rest of the + * segment selectors need to be saved and restored here. + */ + u16 ds, es, fs, gs; + + /* + * Usermode FSBASE and GSBASE may not match the fs and gs selectors, + * so we save them separately. We save the kernelmode GSBASE to + * restore percpu access after resume. + */ + unsigned long kernelmode_gs_base, usermode_gs_base, fs_base; + unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; bool misc_enable_saved; @@ -30,8 +42,7 @@ struct saved_context { u16 gdt_pad; /* Unused */ struct desc_ptr gdt_desc; u16 idt_pad; - u16 idt_limit; - unsigned long idt_base; + struct desc_ptr idt; u16 ldt; u16 tss; unsigned long tr; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8c6bd6863db9..eb5f7999a893 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); /* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *prev, - struct task_struct *next) +static inline void prepare_switch_to(struct task_struct *next) { #ifdef CONFIG_VMAP_STACK /* @@ -70,7 +69,7 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(prev, next); \ + prepare_switch_to(next); \ \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) @@ -79,10 +78,10 @@ do { \ static inline void refresh_sysenter_cs(struct thread_struct *thread) { /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) return; - this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } #endif @@ -90,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. */ static inline void update_sp0(struct task_struct *task) { + /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 load_sp0(task->thread.sp0); #else - load_sp0(task_top_of_stack(task)); + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task_top_of_stack(task)); #endif } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 70f425947dc5..00223333821a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) #endif #endif diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 509046cfa5ce..4a08dd2ab32a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -9,70 +9,130 @@ #include <asm/cpufeature.h> #include <asm/special_insns.h> #include <asm/smp.h> +#include <asm/invpcid.h> +#include <asm/pti.h> +#include <asm/processor-flags.h> -static inline void __invpcid(unsigned long pcid, unsigned long addr, - unsigned long type) -{ - struct { u64 d[2]; } desc = { { pcid, addr } }; +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ - /* - * The memory clobber is because the whole point is to invalidate - * stale TLB entries and, especially if we're flushing global - * mappings, we don't want the compiler to reorder any subsequent - * memory accesses before the TLB flush. - * - * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and - * invpcid (%rcx), %rax in long mode. - */ - asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (&desc) : "memory"); -} +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS 12 -#define INVPCID_TYPE_INDIV_ADDR 0 -#define INVPCID_TYPE_SINGLE_CTXT 1 -#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 -#define INVPCID_TYPE_ALL_NON_GLOBAL 3 +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif -/* Flush all mappings for a given pcid and addr, not including globals. */ -static inline void invpcid_flush_one(unsigned long pcid, - unsigned long addr) -{ - __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); -} +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) -/* Flush all mappings for a given PCID, not including globals. */ -static inline void invpcid_flush_single_context(unsigned long pcid) +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid) { - __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. + */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, + * so do not bother to clear it. + * + * If PCID is on, ASID-aware code paths put the ASID+1 into the + * PCID bits. This serves two purposes. It prevents a nasty + * situation in which PCID-unaware code saves CR3, loads some other + * value (with PCID == 0), and then restores CR3, thus corrupting + * the TLB for ASID 0 if the saved ASID was nonzero. It also means + * that any bugs involving loading a PCID-enabled CR3 with + * CR4.PCIDE off will trigger deterministically. + */ + return asid + 1; } -/* Flush all mappings, including globals, for all PCIDs. */ -static inline void invpcid_flush_all(void) +/* + * Given @asid, compute uPCID + */ +static inline u16 user_pcid(u16 asid) { - __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); + u16 ret = kern_pcid(asid); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + ret |= 1 << X86_CR3_PTI_SWITCH_BIT; +#endif + return ret; } -/* Flush all mappings for all PCIDs except globals. */ -static inline void invpcid_flush_all_nonglobals(void) +struct pgd_t; +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) { - __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); + if (static_cpu_has(X86_FEATURE_PCID)) { + return __sme_pa(pgd) | kern_pcid(asid); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(pgd); + } } -static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) { - u64 new_tlb_gen; - - /* - * Bump the generation count. This also serves as a full barrier - * that synchronizes with switch_mm(): callers are required to order - * their read of mm_cpumask after their writes to the paging - * structures. - */ - smp_mb__before_atomic(); - new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); - smp_mb__after_atomic(); - - return new_tlb_gen; + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; } #ifdef CONFIG_PARAVIRT @@ -99,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void) return !static_cpu_has(X86_FEATURE_PCID); } -/* - * 6 because 6 should be plenty and struct tlb_state will fit in - * two cache lines. - */ -#define TLB_NR_DYN_ASIDS 6 - struct tlb_context { u64 ctx_id; u64 tlb_gen; @@ -139,6 +193,24 @@ struct tlb_state { bool is_lazy; /* + * If set we changed the page tables in such a way that we + * needed an invalidation of all contexts (aka. PCIDs / ASIDs). + * This tells us to go invalidate all the non-loaded ctxs[] + * on the next context switch. + * + * The current ctx was kept up-to-date as it ran and does not + * need to be invalidated. + */ + bool invalidate_other; + + /* + * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate + * the corresponding user PCID needs a flush next time we + * switch to it; see SWITCH_TO_USER_CR3. + */ + unsigned short user_pcid_flush_mask; + + /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ @@ -173,40 +245,43 @@ static inline void cr4_init_shadow(void) this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } +static inline void __cr4_set(unsigned long cr4) +{ + lockdep_assert_irqs_disabled(); + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +} + /* Set in this cpu's CR4. */ static inline void cr4_set_bits(unsigned long mask) { - unsigned long cr4; + unsigned long cr4, flags; + local_irq_save(flags); cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 | mask) != cr4) { - cr4 |= mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); - } + if ((cr4 | mask) != cr4) + __cr4_set(cr4 | mask); + local_irq_restore(flags); } /* Clear in this cpu's CR4. */ static inline void cr4_clear_bits(unsigned long mask) { - unsigned long cr4; + unsigned long cr4, flags; + local_irq_save(flags); cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 & ~mask) != cr4) { - cr4 &= ~mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); - } + if ((cr4 & ~mask) != cr4) + __cr4_set(cr4 & ~mask); + local_irq_restore(flags); } -static inline void cr4_toggle_bits(unsigned long mask) +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) { unsigned long cr4; cr4 = this_cpu_read(cpu_tlbstate.cr4); - cr4 ^= mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); + __cr4_set(cr4 ^ mask); } /* Read the CR4 shadow. */ @@ -216,6 +291,14 @@ static inline unsigned long cr4_read_shadow(void) } /* + * Mark all other ASIDs as invalid, preserves the current. + */ +static inline void invalidate_other_asid(void) +{ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/* * Save some of cr4 feature set we're using (e.g. Pentium 4MB * enable and PPro Global page enable), so that any CPU's that boot * up after us can get the correct flags. This should only be used @@ -234,37 +317,63 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) extern void initialize_tlbstate_and_flush(void); -static inline void __native_flush_tlb(void) +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid) { + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + return; + /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it. */ - preempt_disable(); - native_write_cr3(__native_read_cr3()); - preempt_enable(); + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); } -static inline void __native_flush_tlb_global_irq_disabled(void) +/* + * flush the entire current user mapping + */ +static inline void __native_flush_tlb(void) { - unsigned long cr4; + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). + */ + WARN_ON_ONCE(preemptible()); - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); } +/* + * flush everything + */ static inline void __native_flush_tlb_global(void) { - unsigned long flags; + unsigned long cr4, flags; if (static_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -277,36 +386,69 @@ static inline void __native_flush_tlb_global(void) */ raw_local_irq_save(flags); - __native_flush_tlb_global_irq_disabled(); + cr4 = this_cpu_read(cpu_tlbstate.cr4); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + native_write_cr4(cr4); raw_local_irq_restore(flags); } +/* + * flush one page in the user mapping + */ static inline void __native_flush_tlb_single(unsigned long addr) { + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); } +/* + * flush everything + */ static inline void __flush_tlb_all(void) { - if (boot_cpu_has(X86_FEATURE_PGE)) + if (boot_cpu_has(X86_FEATURE_PGE)) { __flush_tlb_global(); - else + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. + */ __flush_tlb(); - - /* - * Note: if we somehow had PCID but not PGE, then this wouldn't work -- - * we'd end up flushing kernel translations for the current ASID but - * we might fail to flush kernel translations for other cached ASIDs. - * - * To avoid this issue, we force PCID off if PGE is off. - */ + } } +/* + * flush one page in the kernel mapping + */ static inline void __flush_tlb_one(unsigned long addr) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * __flush_tlb_single() will have cleared the TLB entry for this ASID, + * but since kernel space is replicated across all, we must also + * invalidate all others. + */ + invalidate_other_asid(); } #define TLB_FLUSH_ALL -1UL @@ -367,6 +509,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + return atomic64_inc_return(&mm->context.tlb_gen); +} + static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 84b9ec0c1bc0..22647a642e98 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed, DECLARE_EVENT_CLASS(vector_activate, TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve, - bool early), + bool reserve), - TP_ARGS(irq, is_managed, can_reserve, early), + TP_ARGS(irq, is_managed, can_reserve, reserve), TP_STRUCT__entry( __field( unsigned int, irq ) __field( bool, is_managed ) __field( bool, can_reserve ) - __field( bool, early ) + __field( bool, reserve ) ), TP_fast_assign( __entry->irq = irq; __entry->is_managed = is_managed; __entry->can_reserve = can_reserve; - __entry->early = early; + __entry->reserve = reserve; ), - TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d", + TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d", __entry->irq, __entry->is_managed, __entry->can_reserve, - __entry->early) + __entry->reserve) ); #define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \ DEFINE_EVENT_FN(vector_activate, name, \ TP_PROTO(unsigned int irq, bool is_managed, \ - bool can_reserve, bool early), \ - TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL); \ + bool can_reserve, bool reserve), \ + TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \ DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate); DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1fadd310ff68..31051f35cbb7 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long); #ifdef CONFIG_X86_64 dotraplinkage void do_double_fault(struct pt_regs *, long); -asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9cc6fe1fc6f..c1688c2d0a12 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -7,6 +7,9 @@ #include <asm/ptrace.h> #include <asm/stacktrace.h> +#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) +#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) + struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; @@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) +/* + * WARNING: The entire pt_regs may not be safe to dereference. In some cases, + * only the iret frame registers are accessible. Use with caution! + */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d9a7c659009c..b986b2ca688a 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -7,6 +7,7 @@ #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); +extern void set_vsyscall_pgtable_user_bits(pgd_t *root); /* * Called on instruction fetch fault in vsyscall page. diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index da1489cb64dc..1e901e421f2d 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generated-y += unistd_32.h generated-y += unistd_64.h generated-y += unistd_x32.h diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 7e1e730396ae..bcba3c643e63 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -78,7 +78,12 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ + +#define X86_CR3_PCID_BITS 12 +#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) + +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6e272f3ea984..880441f24146 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2626,11 +2626,13 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64 else { pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } +#endif return 0; } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index aa85690e9b64..25a87028cb3f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7b659c4480c9..5078b5ce63a7 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 201579dc5242..8a7963421460 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, } int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { unsigned long flags; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be764422..ce503c99f5c4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fa22017de806..02e8acb134f8 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6a823a25eaff..f8b03bb8e725 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd) irq_matrix_reserve(vector_matrix); apicd->can_reserve = true; apicd->has_reserved = true; + irqd_set_can_reserve(irqd); trace_vector_reserve(irqd->irq, 0); vector_assign_managed_shutdown(irqd); } @@ -368,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd) int ret; ret = assign_irq_vector_any_locked(irqd); - if (!ret) + if (!ret) { apicd->has_reserved = false; + /* + * Core might have disabled reservation mode after + * allocating the irq descriptor. Ideally this should + * happen before allocation time, but that would require + * completely convoluted ways of transporting that + * information. + */ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; + } return ret; } @@ -398,21 +409,21 @@ static int activate_managed(struct irq_data *irqd) } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, - bool early) + bool reserve) { struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; int ret = 0; trace_vector_activate(irqd->irq, apicd->is_managed, - apicd->can_reserve, early); + apicd->can_reserve, reserve); /* Nothing to do for fixed assigned vectors */ if (!apicd->can_reserve && !apicd->is_managed) return 0; raw_spin_lock_irqsave(&vector_lock, flags); - if (early || irqd_is_managed_and_shutdown(irqd)) + if (reserve || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); else if (apicd->is_managed) ret = activate_managed(irqd); @@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, } else { /* Release the vector */ apicd->can_reserve = true; + irqd_set_can_reserve(irqd); clear_irq_vector(irqd); realloc = true; } @@ -542,8 +554,8 @@ error: } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS -void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, - struct irq_data *irqd, int ind) +static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) { unsigned int cpu, vector, prev_cpu, prev_vector; struct apic_chip_data *apicd; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 622f13ca8a94..8b04234e010b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8ea78275480d..76417a9aab73 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,6 +17,7 @@ #include <asm/sigframe.h> #include <asm/bootparam.h> #include <asm/suspend.h> +#include <asm/tlbflush.h> #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -93,4 +94,13 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dedf428b20b6..fa1261eefa16 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -47,13 +47,8 @@ void foo(void) BLANK(); /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); - - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 630212fa9b9d..bf51e51d808d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,6 +23,9 @@ int main(void) #ifdef CONFIG_PARAVIRT OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); +#endif BLANK(); #endif @@ -63,6 +66,7 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd44..bcb75dc97d44 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x17: init_amd_zn(c); break; } - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fa998ca8aa5a..c47de4ebf63a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; void load_percpu_segment(int cpu) { @@ -490,28 +490,23 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) -{ -#ifdef CONFIG_X86_64 - /* On 64-bit systems, we use a read-only fixmap GDT. */ - pgprot_t prot = PAGE_KERNEL_RO; -#else - /* - * On native 32-bit systems, the GDT cannot be read-only because - * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the GDT - * is read-only, that will triple fault. - * - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. - */ - pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? - PAGE_KERNEL_RO : PAGE_KERNEL; +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif - __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); -} +#ifdef CONFIG_X86_64 +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; +#endif /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) @@ -747,7 +742,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) { int i; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } @@ -927,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + /* Assume for now that ALL x86 CPUs are insecure */ + setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + fpu__init_system(c); #ifdef CONFIG_X86_32 @@ -1250,7 +1249,7 @@ void enable_sep_cpu(void) return; cpu = get_cpu(); - tss = &per_cpu(cpu_tss, cpu); + tss = &per_cpu(cpu_tss_rw, cpu); /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1259,11 +1258,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), - 0); - + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1357,25 +1352,22 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - /* May not be marked __init: used by software suspend */ void syscall_init(void) { + extern char _entry_trampoline[]; + extern char entry_SYSCALL_64_trampoline[]; + + int cpu = smp_processor_id(); + unsigned long SYSCALL64_entry_trampoline = + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + + (entry_SYSCALL_64_trampoline - _entry_trampoline); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + if (static_cpu_has(X86_FEATURE_PTI)) + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + else + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@ -1386,7 +1378,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1530,7 +1522,7 @@ void cpu_init(void) if (cpu) load_ucode_ap(); - t = &per_cpu(cpu_tss, cpu); + t = &per_cpu(cpu_tss_rw, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1569,7 +1561,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = per_cpu(exception_stacks, cpu); + char *estacks = get_cpu_entry_area(cpu)->exception_stacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; @@ -1580,7 +1572,7 @@ void cpu_init(void) } } - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; /* * <= is required because the CPU will access up to @@ -1596,11 +1588,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, me); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); @@ -1612,7 +1605,6 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } @@ -1622,7 +1614,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); wait_for_master_cpu(cpu); @@ -1657,12 +1649,12 @@ void cpu_init(void) * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; #ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ @@ -1674,7 +1666,6 @@ void cpu_init(void) fpu__init_cpu(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c6daec4bdba5..330b8462d426 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -470,6 +470,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200 switch (family) { case 0x14: @@ -481,6 +482,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, case 0x16: max_size = F16H_MPB_MAX_SIZE; break; + case 0x17: + max_size = F17H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 7dbcb7adf797..8ccdca6d3f9e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci) } #else -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - static inline void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc; @@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (rev != mc->hdr.rev) return -1; -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution. */ - flush_tlb_early(); -#endif uci->cpu_sig.rev = rev; if (early) diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0e662c55ae90..0b8cedb20d6d 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -50,25 +50,23 @@ static void doublefault_fn(void) cpu_relax(); } -struct tss_struct doublefault_tss __cacheline_aligned = { - .x86_tss = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, - - .__cr3 = __pa_nodebug(swapper_pg_dir), - } +struct x86_hw_tss doublefault_tss __cacheline_aligned = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + + .__cr3 = __pa_nodebug(swapper_pg_dir), }; /* dummy for do_double_fault() call */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f13b4c00a5de..5fa110699ed2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,6 +18,7 @@ #include <linux/nmi.h> #include <linux/sysfs.h> +#include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> #include <asm/unwind.h> @@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } +bool in_entry_stack(unsigned long *stack, struct stack_info *info) +{ + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + + void *begin = ss; + void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_ENTRY; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { @@ -50,6 +69,28 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +void show_iret_regs(struct pt_regs *regs) +{ + printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) +{ + if (on_stack(info, regs, sizeof(*regs))) + __show_regs(regs, 0); + else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs); + } +} + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { @@ -71,31 +112,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) + * - entry stack * - * x86-32 can have up to three stacks: + * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack + * - entry stack */ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; - /* - * If we overflowed the task stack into a guard page, jump back - * to the bottom of the usable stack. - */ - if (task_stack_page(task) - (void *)stack < PAGE_SIZE) - stack = task_stack_page(task); - - if (get_stack_info(stack, task, &stack_info, &visit_mask)) - break; + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } stack_name = stack_type_name(stack_info.type); if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); /* * Scan the stack, printing any text addresses we find. At the @@ -119,7 +164,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by __show_regs() below. + * by show_regs_safe() below. */ if (regs && stack == ®s->ip) goto next; @@ -155,8 +200,8 @@ next: /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); } if (stack_name) @@ -252,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index daefae83a3aa..04170f63e3a1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; + if (type == STACK_TYPE_ENTRY) + return "ENTRY_TRAMPOLINE"; + return NULL; } @@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (task != current) goto unknown; + if (in_entry_stack(stack, info)) + goto recursion_check; + if (in_hardirq_stack(stack, info)) goto recursion_check; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 88ce2ffdb110..563e28d14f2c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_ENTRY) { + /* + * On 64-bit, we have a generic entry stack that we + * use for all the kernel entry points, including + * SYSENTER. + */ + return "ENTRY_TRAMPOLINE"; + } + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_irq_stack(stack, info)) goto recursion_check; + if (in_entry_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7dca675fe78d..04a625f0fcda 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -350,13 +371,14 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_top_pgt) +NEXT_PGD_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts) .data #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt) */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #else -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 #endif #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c4..2f723301eb58 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(cpu_tss, get_cpu()); + tss = &per_cpu(cpu_tss_rw, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 49cfd9fe7589..68e1867cca80 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - /* - * NB: Unlike exception entries, IRQ entries do not reliably - * handle context tracking in the low-level entry code. This is - * because syscall entries execute briefly with IRQs on before - * updating context tracking state, so we can take an IRQ from - * kernel mode with CONTEXT_USER. The low-level entry code only - * updates the context if we came from user mode, so we won't - * switch to CONTEXT_KERNEL. We'll fix that once the syscall - * code is cleaned up enough that we can cleanly defer enabling - * IRQs. - */ - entering_irq(); /* entering_irq() tells RCU that we're not quiescent. Check it. */ diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 020efbf5786b..d86e344f5b3d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom); + estack_top, estack_bottom, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1c1eae961340..26d713ecad34 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -5,6 +5,11 @@ * Copyright (C) 2002 Andi Kleen * * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + * contex.ldt_usr_sem + * mmap_sem + * context.lock */ #include <linux/errno.h> @@ -19,6 +24,7 @@ #include <linux/uaccess.h> #include <asm/ldt.h> +#include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> #include <asm/syscalls.h> @@ -42,17 +48,15 @@ static void refresh_ldt_segments(void) #endif } -/* context.lock is held for us, so we don't need any locking. */ +/* context.lock is held by the task which issued the smp function call */ static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; - mm_context_t *pc; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = &mm->context; - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + load_mm_ldt(mm); refresh_ldt_segments(); } @@ -89,25 +93,143 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return NULL; } + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + new_ldt->nr_entries = num_entries; return new_ldt; } +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + * + * There is no corresponding unmap function. Even if the LDT is freed, we + * leave the PTEs around until the slot is reused or the mm is destroyed. + * This is harmless: the LDT is always in ordinary memory, and no one will + * access the freed slot. + * + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make + * it useful, and the flush would slow down modify_ldt(). + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + bool is_vmalloc, had_top_level_entry; + unsigned long va; + spinlock_t *ptl; + pgd_t *pgd; + int i; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* + * Did we already have the top level entry allocated? We can't + * use pgd_none() for this because it doens't do anything on + * 4-level page table kernels. + */ + pgd = pgd_offset(mm, LDT_BASE_ADDR); + had_top_level_entry = (pgd->pgd != 0); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + /* + * Map it RO so the easy to find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON(kernel_to_user_pgdp(pgd)->pgd); + set_pgd(kernel_to_user_pgdp(pgd), *pgd); + } + } + + va = (unsigned long)ldt_slot_va(slot); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + + ldt->slot = slot; +#endif + return 0; +} + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = start + (1UL << PGDIR_SHIFT); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + tlb_gather_mmu(&tlb, mm, start, end); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb, start, end); +#endif +} + /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } -/* context.lock is held */ -static void install_ldt(struct mm_struct *current_mm, - struct ldt_struct *ldt) +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) { + mutex_lock(&mm->context.lock); + /* Synchronizes with READ_ONCE in load_mm_ldt. */ - smp_store_release(¤t_mm->context.ldt, ldt); + smp_store_release(&mm->context.ldt, ldt); - /* Activate the LDT for all CPUs using current_mm. */ - on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); + /* Activate the LDT for all CPUs using currents mm. */ + on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); + + mutex_unlock(&mm->context.lock); } static void free_ldt_struct(struct ldt_struct *ldt) @@ -124,27 +246,20 @@ static void free_ldt_struct(struct ldt_struct *ldt) } /* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, + * the new task is not running, so nothing can be installed. */ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) { struct ldt_struct *new_ldt; - struct mm_struct *old_mm; int retval = 0; - mutex_init(&mm->context.lock); - old_mm = current->mm; - if (!old_mm) { - mm->context.ldt = NULL; + if (!old_mm) return 0; - } mutex_lock(&old_mm->context.lock); - if (!old_mm->context.ldt) { - mm->context.ldt = NULL; + if (!old_mm->context.ldt) goto out_unlock; - } new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { @@ -156,6 +271,12 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } mm->context.ldt = new_ldt; out_unlock: @@ -174,13 +295,18 @@ void destroy_context_ldt(struct mm_struct *mm) mm->context.ldt = NULL; } +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; unsigned long entries_size; int retval; - mutex_lock(&mm->context.lock); + down_read(&mm->context.ldt_usr_sem); if (!mm->context.ldt) { retval = 0; @@ -209,7 +335,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) retval = bytecount; out_unlock: - mutex_unlock(&mm->context.lock); + up_read(&mm->context.ldt_usr_sem); return retval; } @@ -269,7 +395,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ldt.avl = 0; } - mutex_lock(&mm->context.lock); + if (down_write_killable(&mm->context.ldt_usr_sem)) + return -EINTR; old_ldt = mm->context.ldt; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; @@ -286,12 +413,31 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + /* + * This only can fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + install_ldt(mm, new_ldt); free_ldt_struct(old_ldt); error = 0; out_unlock: - mutex_unlock(&mm->context.lock); + up_write(&mm->context.ldt_usr_sem); out: return error; } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 00bc751c861c..edfede768688 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -48,8 +48,6 @@ static void load_segments(void) "\tmovl $"STR(__KERNEL_DS)",%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" : : : "eax", "memory"); #undef STR @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); idt_invalidate(phys_to_virt(0)); + set_gdt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index ac0be8283325..9edadabf04f6 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 97fb3e5737f5..aed9d94bd46f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower @@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { * Poison it. */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_64 + /* + * .sp1 is cpu_current_top_of_stack. The init task never + * runs user code, but cpu_current_top_of_stack should still + * be well defined before the first context switch. + */ + .sp1 = TOP_OF_INIT_STACK, +#endif + #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, @@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif -#ifdef CONFIG_X86_32 - .SYSENTER_stack_canary = STACK_END_MAGIC, -#endif }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); @@ -299,7 +306,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, } if ((tifp ^ tifn) & _TIF_NOTSC) - cr4_toggle_bits(X86_CR4_TSD); + cr4_toggle_bits_irqsoff(X86_CR4_TSD); if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 45bf0c5f93e1..5224c6099184 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eeeb34f85c25..c75466232016 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, - regs->sp, regs->flags); + show_iret_regs(regs); + if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else @@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); + if (!all) + return; + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - if (!all) - return; - cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); @@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); @@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); + this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ update_sp0(next_p); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3d01df7d7cf6..ed556d50d7ed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -106,7 +106,7 @@ EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; /* Maximum number of SMT threads on any online core */ -int __max_smt_threads __read_mostly; +int __read_mostly __max_smt_threads = 1; /* Flag to indicate if a complete sched domain rebuild is required */ bool x86_topology_update; @@ -126,14 +126,10 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; - pr_debug("2.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; - pr_debug("3.\n"); } static inline void smpboot_restore_warm_reset_vector(void) @@ -141,11 +137,6 @@ static inline void smpboot_restore_warm_reset_vector(void) unsigned long flags; /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* * Paranoid: Set warm reset code and vector here back * to default values. */ @@ -237,7 +228,7 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - + load_current_idt(); cpu_init(); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); @@ -932,12 +923,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; - /* - * Enable the espfix hack for this CPU - */ -#ifdef CONFIG_X86_ESPFIX64 + /* Enable the espfix hack for this CPU */ init_espfix_ap(cpu); -#endif /* So we see what's up */ announce_cpu(cpu, apicid); @@ -1304,7 +1291,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) * Today neither Intel nor AMD support heterogenous systems so * extrapolate the boot cpu's data to all packages. */ - ncpus = cpu_data(0).booted_cores * smp_num_siblings; + ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); pr_info("Max logical packages: %u\n", __max_logical_packages); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 77835bc021c7..20161ef53537 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk, { int ret; + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ if (!try_get_task_stack(tsk)) - return -EINVAL; + return 0; ret = __save_stack_trace_reliable(trace, tsk); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9a9c9b076955..a5b802a12212 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) { + if (LDT_empty(info) || LDT_zero(info)) memset(desc, 0, sizeof(*desc)); - } else { + else fill_ldt(desc, info); - - /* - * Always set the accessed bit so that the CPU - * doesn't try to write to the (read-only) GDT. - */ - desc->type |= 1; - } ++info; ++desc; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 989514c94a55..446c9ef8cfc3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/fpu/internal.h> +#include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> @@ -348,23 +349,42 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) /* * If IRET takes a non-IST fault on the espfix64 stack, then we - * end up promoting it to a doublefault. In that case, modify - * the stack to make it look like we just entered the #GP - * handler from user space, similar to bad_iret. + * end up promoting it to a doublefault. In that case, take + * advantage of the fact that we're not using the normal (TSS.sp0) + * stack right now. We can write a fake #GP(0) frame at TSS.sp0 + * and then modify our own IRET frame so that, when we return, + * we land directly at the #GP(0) vector with the stack already + * set up according to its expectations. + * + * The net result is that our #GP handler will think that we + * entered from usermode with the bad user context. * * No need for ist_enter here because we don't use RCU. */ - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *normal_regs = task_pt_regs(current); + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - /* Fake a #GP(0) from userspace. */ - memmove(&normal_regs->ip, (void *)regs->sp, 5*8); - normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + /* + * regs->sp points to the failing IRET frame on the + * ESPFIX64 stack. Copy it to the entry stack. This fills + * in gpregs->ss through gpregs->ip. + * + */ + memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ + + /* + * Adjust our frame so that we return straight to the #GP + * vector with the expected RSP value. This is safe because + * we won't enable interupts or schedule before we invoke + * general_protection, so nothing will clobber the stack + * frame we just set up. + */ regs->ip = (unsigned long)general_protection; - regs->sp = (unsigned long)&normal_regs->orig_ax; + regs->sp = (unsigned long)&gpregs->orig_ax; return; } @@ -389,7 +409,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being - * deliv- ered, the faulting linear address of the second fault will + * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a @@ -605,14 +625,15 @@ NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* - * Help handler running on IST stack to switch off the IST stack if the - * interrupted code was in user mode. The actual stack switch is done in - * entry_64.S + * Help handler running on a per-cpu (IST or entry trampoline) stack + * to switch to the normal thread stack if the interrupted code was in + * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = task_pt_regs(current); - *regs = *eregs; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + if (regs != eregs) + *regs = *eregs; return regs; } NOKPROBE_SYMBOL(sync_regs); @@ -628,13 +649,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault - * correctly, we want move our stack frame to task_pt_regs - * and we want to pretend that the exception came from the - * iret target. + * correctly, we want to move our stack frame to where it would + * be had we entered directly on the entry stack (rather than + * just below the IRET frame) and we want to pretend that the + * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - container_of(task_pt_regs(current), - struct bad_iret_stack, regs); + (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); @@ -795,14 +816,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: -#if defined(CONFIG_X86_32) - /* - * This is the most likely code path that involves non-trivial use - * of the SYSENTER stack. Check that we haven't overrun it. - */ - WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, - "Overran or corrupted SYSENTER stack\n"); -#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -929,6 +942,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init trap_init(void) { + /* Init cpu_entry_area before IST entries are set up */ + setup_cpu_entry_areas(); + idt_setup_traps(); /* @@ -936,8 +952,9 @@ void __init trap_init(void) * "sidt" instruction will not leak the location of the kernel, and * to defend the IDT against arbitrary memory write vulnerabilities. * It will be reloaded in cpu_init() */ - __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); - idt_descr.address = fix_to_virt(FIX_RO_IDT); + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; /* * Should be a barrier for any external CPU state: diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a3f973b2c97a..be86a865087a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } -static bool stack_access_ok(struct unwind_state *state, unsigned long addr, +static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, size_t len) { struct stack_info *info = &state->stack_info; + void *addr = (void *)_addr; - /* - * If the address isn't on the current stack, switch to the next one. - * - * We may have to traverse multiple stacks to deal with the possibility - * that info->next_sp could point to an empty stack and the address - * could be on a subsequent stack. - */ - while (!on_stack(info, (void *)addr, len)) - if (get_stack_info(info->next_sp, state->task, info, - &state->stack_mask)) - return false; + if (!on_stack(info, addr, len) && + (get_stack_info(addr, state->task, info, &state->stack_mask))) + return false; return true; } @@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, return true; } -#define REGS_SIZE (sizeof(struct pt_regs)) -#define SP_OFFSET (offsetof(struct pt_regs, sp)) -#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) -#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) - static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, - unsigned long *ip, unsigned long *sp, bool full) + unsigned long *ip, unsigned long *sp) { - size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; - size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; - struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); - - if (IS_ENABLED(CONFIG_X86_64)) { - if (!stack_access_ok(state, addr, regs_size)) - return false; + struct pt_regs *regs = (struct pt_regs *)addr; - *ip = regs->ip; - *sp = regs->sp; + /* x86-32 support will be more complicated due to the ®s->sp hack */ + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); - return true; - } - - if (!stack_access_ok(state, addr, sp_offset)) + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; *ip = regs->ip; + *sp = regs->sp; + return true; +} - if (user_mode(regs)) { - if (!stack_access_ok(state, addr + sp_offset, - REGS_SIZE - SP_OFFSET)) - return false; +static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp) +{ + struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; - *sp = regs->sp; - } else - *sp = (unsigned long)®s->sp; + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + *ip = regs->ip; + *sp = regs->sp; return true; } @@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; - struct pt_regs *ptregs; bool indirect = false; if (unwind_done(state)) @@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; @@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS_IRET: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; } - ptregs = container_of((void *)sp, struct pt_regs, ip); - if ((unsigned long)ptregs >= prev_sp && - on_stack(&state->stack_info, ptregs, REGS_SIZE)) { - state->regs = ptregs; - state->full_regs = false; - } else - state->regs = NULL; - + state->regs = (void *)sp - IRET_FRAME_OFFSET; + state->full_regs = false; state->signal = true; break; @@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } if (get_stack_info((unsigned long *)state->sp, state->task, - &state->stack_info, &state->stack_mask)) - return; + &state->stack_info, &state->stack_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + if (get_stack_info(next_page, state->task, &state->stack_info, + &state->stack_mask)) + return; + } /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be87..1e413a9326aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -61,11 +61,17 @@ jiffies_64 = jiffies; . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); + #else #define X64_ALIGN_RODATA_BEGIN #define X64_ALIGN_RODATA_END +#define ALIGN_ENTRY_TEXT_BEGIN +#define ALIGN_ENTRY_TEXT_END + #endif PHDRS { @@ -102,11 +108,22 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT IRQENTRY_TEXT + ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + _entry_trampoline = .; + *(.entry_trampoline) + . = ALIGN(PAGE_SIZE); + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); +#endif + /* End of text section */ _etext = .; } :text = 0x9090 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e7d04d0c8008..b514b2b2845a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1046,7 +1046,6 @@ static void fetch_register_operand(struct operand *op) static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; @@ -1068,13 +1067,11 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) #endif default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; @@ -1096,12 +1093,10 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, #endif default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; @@ -1113,12 +1108,10 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; @@ -1130,7 +1123,6 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1138,9 +1130,7 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - ctxt->ops->get_fpu(ctxt); asm volatile("fninit"); - ctxt->ops->put_fpu(ctxt); return X86EMUL_CONTINUE; } @@ -1151,9 +1141,7 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - ctxt->ops->get_fpu(ctxt); asm volatile("fnstcw %0": "+m"(fcw)); - ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fcw; @@ -1167,9 +1155,7 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - ctxt->ops->get_fpu(ctxt); asm volatile("fnstsw %0": "+m"(fsw)); - ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fsw; @@ -2404,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) } static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr4) + u64 cr0, u64 cr3, u64 cr4) { int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = ctxt->ops->set_cr(ctxt, 3, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; /* * First enable PAE, long mode needs it before CR0.PG = 1 is set. @@ -2425,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, bad = ctxt->ops->set_cr(ctxt, 4, cr4); if (bad) return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + } return X86EMUL_CONTINUE; @@ -2435,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) struct desc_struct desc; struct desc_ptr dt; u16 selector; - u32 val, cr0, cr4; + u32 val, cr0, cr3, cr4; int i; cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); @@ -2481,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); - return rsm_enter_protected_mode(ctxt, cr0, cr4); + return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) { struct desc_struct desc; struct desc_ptr dt; - u64 val, cr0, cr4; + u64 val, cr0, cr3, cr4; u32 base3; u16 selector; int i, r; @@ -2505,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr3 = GET_SMSTATE(u64, smbase, 0x7f50); cr4 = GET_SMSTATE(u64, smbase, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); val = GET_SMSTATE(u64, smbase, 0x7ed0); @@ -2533,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) dt.address = GET_SMSTATE(u64, smbase, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); - r = rsm_enter_protected_mode(ctxt, cr0, cr4); + r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); if (r != X86EMUL_CONTINUE) return r; @@ -4001,12 +4005,8 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - ctxt->ops->get_fpu(ctxt); - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); - ctxt->ops->put_fpu(ctxt); - if (rc != X86EMUL_CONTINUE) return rc; @@ -4049,8 +4049,6 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - ctxt->ops->get_fpu(ctxt); - if (size < __fxstate_size(16)) { rc = fxregs_fixup(&fx_state, size); if (rc != X86EMUL_CONTINUE) @@ -4066,8 +4064,6 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: - ctxt->ops->put_fpu(ctxt); - return rc; } @@ -5317,9 +5313,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; - ctxt->ops->get_fpu(ctxt); rc = asm_safe("fwait"); - ctxt->ops->put_fpu(ctxt); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5e66e5c6640..c4deb1f34faa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if(make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, 0, 0, vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); @@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); @@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, 0, vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); @@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4704aaf6d19e..023afa0c8887 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. See 22.2.4. */ vmcs_writel(HOST_TR_BASE, - (unsigned long)this_cpu_ptr(&cpu_tss)); + (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* @@ -6751,16 +6751,10 @@ static __init int hardware_setup(void) goto out; } - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - clear_bit(0x80, vmx_io_bitmap_a); memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eee8e7faf1af..1cec2c62a0b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2937,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, idx); pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); - kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); } @@ -4385,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); handled += n; addr += n; len -= n; @@ -4644,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_fragments[0].gpa, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, val); vcpu->mmio_read_completed = 0; return 1; } @@ -4666,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) { - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val); return vcpu_mmio_write(vcpu, gpa, bytes, val); } static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL); return X86EMUL_IO_NEEDED; } @@ -5252,17 +5251,6 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt) emul_to_vcpu(ctxt)->arch.halt_request = 1; } -static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) -{ - preempt_disable(); - kvm_load_guest_fpu(emul_to_vcpu(ctxt)); -} - -static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) -{ - preempt_enable(); -} - static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) @@ -5340,8 +5328,6 @@ static const struct x86_emulate_ops emulate_ops = { .halt = emulator_halt, .wbinvd = emulator_wbinvd, .fix_hypercall = emulator_fix_hypercall, - .get_fpu = emulator_get_fpu, - .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, @@ -6778,6 +6764,20 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) kvm_x86_ops->tlb_flush(vcpu); } +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) +{ + unsigned long apic_address; + + /* + * The physical address of apic access page is stored in the VMCS. + * Update it when it becomes invalid. + */ + apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (start <= apic_address && apic_address < end) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); +} + void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; @@ -6952,7 +6952,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -7265,13 +7264,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct fpu *fpu = ¤t->thread.fpu; int r; - fpu__initialize(fpu); - kvm_sigset_activate(vcpu); + kvm_load_guest_fpu(vcpu); + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { r = -EINTR; @@ -7312,6 +7310,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = vcpu_run(vcpu); out: + kvm_put_guest_fpu(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); @@ -7381,7 +7380,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) #endif kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags); + kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; @@ -7495,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); +int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) { + /* + * When EFER.LME and CR0.PG are set, the processor is in + * 64-bit mode (though maybe in a 32-bit code segment). + * CR4.PAE and EFER.LMA must be set. + */ + if (!(sregs->cr4 & X86_CR4_PAE_BIT) + || !(sregs->efer & EFER_LMA)) + return -EINVAL; + } else { + /* + * Not in 64-bit mode: EFER.LMA is clear and the code + * segment cannot be 64-bit. + */ + if (sregs->efer & EFER_LMA || sregs->cs.l) + return -EINVAL; + } + + return 0; +} + int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -7507,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, (sregs->cr4 & X86_CR4_OSXSAVE)) return -EINVAL; + if (kvm_valid_sregs(vcpu, sregs)) + return -EINVAL; + apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) @@ -7704,32 +7729,25 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +/* Swap (qemu) user FPU context for the guest FPU context. */ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (vcpu->guest_fpu_loaded) - return; - - /* - * Restore all possible states in the guest, - * and assume host would use all available bits. - * Guest xcr0 would be loaded later. - */ - vcpu->guest_fpu_loaded = 1; - __kernel_fpu_begin(); + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, ~XFEATURE_MASK_PKRU); + preempt_enable(); trace_kvm_fpu(1); } +/* When vcpu_run ends, restore user space FPU context. */ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; + preempt_disable(); copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - __kernel_fpu_end(); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -7846,7 +7864,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * To avoid have the INIT path from kvm_apic_has_events() that be * called with loaded FPU and does not let userspace fix the state. */ - kvm_put_guest_fpu(vcpu); + if (init_event) + kvm_put_guest_fpu(vcpu); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, XFEATURE_MASK_BNDREGS); if (mpx_state_buffer) @@ -7855,6 +7874,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) XFEATURE_MASK_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + if (init_event) + kvm_load_guest_fpu(vcpu); } if (!init_event) { diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 553f8fd23cc4..4846eff7e4c8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops) delay = min_t(u64, MWAITX_MAX_LOOPS, loops); /* - * Use cpu_tss as a cacheline-aligned, seldomly + * Use cpu_tss_rw as a cacheline-aligned, seldomly * accessed per-cpu variable as the monitor target. */ - __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index c4d55919fac1..e0b85930dd77 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8e13b8cc6bed..27e9e90a8d35 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_X86_INTEL_MPX) += mpx.o -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 000000000000..b9283cc27622 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include <asm/cpu_entry_area.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <asm/desc.h> + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + + return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ + unsigned long va = (unsigned long) cea_vaddr; + + set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); +} + +static void __init +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); +} + +static void percpu_setup_debug_store(int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL + int npages; + void *cea; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + cea = &get_cpu_entry_area(cpu)->cpu_debug_store; + npages = sizeof(struct debug_store) / PAGE_SIZE; + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, + PAGE_KERNEL); + + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; + /* + * Force the population of PMDs for not yet allocated per cpu + * memory like debug store buffers. + */ + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; + for (; npages; npages--, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) +{ +#ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. + * + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. + */ + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; +#endif + + cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), + gdt_prot); + + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, + &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + +#ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + + cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif + percpu_setup_debug_store(cpu); +} + +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 + unsigned long start, end; + + BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + + start = CPU_ENTRY_AREA_BASE; + end = start + CPU_ENTRY_AREA_MAP_SIZE; + + /* Careful here: start + PMD_SIZE might wrap around */ + for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) + populate_extra_pte(start); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + setup_cpu_entry_area_ptes(); + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); +} diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index bfcffdf6c577..421f2664ffa0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -5,7 +5,7 @@ static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk_pgd_level(m, NULL); + ptdump_walk_pgd_level_debugfs(m, NULL, false); return 0; } @@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -static struct dentry *pe; +static int ptdump_show_curknl(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curknl(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curknl, NULL); +} + +static const struct file_operations ptdump_curknl_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curknl, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static struct dentry *pe_curusr; + +static int ptdump_show_curusr(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curusr(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curusr, NULL); +} + +static const struct file_operations ptdump_curusr_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curusr, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static struct dentry *dir, *pe_knl, *pe_curknl; static int __init pt_dump_debug_init(void) { - pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, - &ptdump_fops); - if (!pe) + dir = debugfs_create_dir("page_tables", NULL); + if (!dir) return -ENOMEM; + pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, + &ptdump_fops); + if (!pe_knl) + goto err; + + pe_curknl = debugfs_create_file("current_kernel", 0400, + dir, NULL, &ptdump_curknl_fops); + if (!pe_curknl) + goto err; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pe_curusr = debugfs_create_file("current_user", 0400, + dir, NULL, &ptdump_curusr_fops); + if (!pe_curusr) + goto err; +#endif return 0; +err: + debugfs_remove_recursive(dir); + return -ENOMEM; } static void __exit pt_dump_debug_exit(void) { - debugfs_remove_recursive(pe); + debugfs_remove_recursive(dir); } module_init(pt_dump_debug_init); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 5e3ac6fe6c9e..f56902c1f04b 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -44,68 +44,97 @@ struct addr_marker { unsigned long max_lines; }; -/* indices for address_markers; keep sync'd w/ address_markers below */ +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 + enum address_markers_idx { USER_SPACE_NR = 0, -#ifdef CONFIG_X86_64 KERNEL_SPACE_NR, LOW_KERNEL_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) + LDT_NR, +#endif VMALLOC_START_NR, VMEMMAP_START_NR, #ifdef CONFIG_KASAN KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif -# ifdef CONFIG_X86_ESPFIX64 +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) + LDT_NR, +#endif + CPU_ENTRY_AREA_NR, +#ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, -# endif +#endif +#ifdef CONFIG_EFI + EFI_END_NR, +#endif HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, -#else + FIXADDR_START_NR, + END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, + [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, +#ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, +#endif + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, +#ifdef CONFIG_X86_ESPFIX64 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI + [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, +#endif + [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, + [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, + [MODULES_END_NR] = { MODULES_END, "End Modules" }, + [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, + [END_OF_SPACE_NR] = { -1, NULL } +}; + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { + USER_SPACE_NR = 0, KERNEL_SPACE_NR, VMALLOC_START_NR, VMALLOC_END_NR, -# ifdef CONFIG_HIGHMEM +#ifdef CONFIG_HIGHMEM PKMAP_BASE_NR, -# endif - FIXADDR_START_NR, #endif + CPU_ENTRY_AREA_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, }; -/* Address space markers hints */ static struct addr_marker address_markers[] = { - { 0, "User Space" }, -#ifdef CONFIG_X86_64 - { 0x8000000000000000UL, "Kernel Space" }, - { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, - { 0/* VMALLOC_START */, "vmalloc() Area" }, - { 0/* VMEMMAP_START */, "Vmemmap" }, -#ifdef CONFIG_KASAN - { KASAN_SHADOW_START, "KASAN shadow" }, - { KASAN_SHADOW_END, "KASAN shadow end" }, + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, +#ifdef CONFIG_HIGHMEM + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, #endif -# ifdef CONFIG_X86_ESPFIX64 - { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, -# endif -# ifdef CONFIG_EFI - { EFI_VA_END, "EFI Runtime Services" }, -# endif - { __START_KERNEL_map, "High Kernel Mapping" }, - { MODULES_VADDR, "Modules" }, - { MODULES_END, "End Modules" }, -#else - { PAGE_OFFSET, "Kernel Mapping" }, - { 0/* VMALLOC_START */, "vmalloc() Area" }, - { 0/*VMALLOC_END*/, "vmalloc() End" }, -# ifdef CONFIG_HIGHMEM - { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, -# endif - { 0/*FIXADDR_START*/, "Fixmap Area" }, -#endif - { -1, NULL } /* End of list */ + [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, + [END_OF_SPACE_NR] = { -1, NULL } }; +#endif /* !CONFIG_X86_64 */ + /* Multipliers for offsets within the PTEs */ #define PTE_LEVEL_MULT (PAGE_SIZE) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) @@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) static const char * const level_name[] = { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; - if (!pgprot_val(prot)) { + if (!(pr & _PAGE_PRESENT)) { /* Not present */ pt_dump_cont_printf(m, dmsg, " "); } else { @@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx) } static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, - bool checkwx) + bool checkwx, bool dmesg) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_top_pgt; @@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, if (pgd) { start = pgd; - st.to_dmesg = true; + st.to_dmesg = dmesg; } st.check_wx = checkwx; @@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { - ptdump_walk_pgd_level_core(m, pgd, false); + ptdump_walk_pgd_level_core(m, pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (user && static_cpu_has(X86_FEATURE_PTI)) + pgd = kernel_to_user_pgdp(pgd); +#endif + ptdump_walk_pgd_level_core(m, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +static void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pgd_t *pgd = (pgd_t *) &init_top_pgt; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("x86/mm: Checking user space page tables\n"); + pgd = kernel_to_user_pgdp(pgd); + ptdump_walk_pgd_level_core(NULL, pgd, true, false); +#endif } -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); void ptdump_walk_pgd_level_checkwx(void) { - ptdump_walk_pgd_level_core(NULL, NULL, true); + ptdump_walk_pgd_level_core(NULL, NULL, true, false); + ptdump_walk_user_pgd_level_checkwx(); } static int __init pt_dump_init(void) @@ -525,8 +578,8 @@ static int __init pt_dump_init(void) address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; # endif address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; + address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; #endif - return 0; } __initcall(pt_dump_init); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3321b446b66c..9fe656c42aa5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,6 +1,7 @@ #include <linux/extable.h> #include <linux/uaccess.h> #include <linux/sched/debug.h> +#include <xen/xen.h> #include <asm/fpu/internal.h> #include <asm/traps.h> @@ -82,7 +83,7 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, return true; } -EXPORT_SYMBOL_GPL(ex_handler_refcount); +EXPORT_SYMBOL(ex_handler_refcount); /* * Handler for when we fail to restore a task's FPU state. We should never get @@ -212,8 +213,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * Old CPUs leave the high bits of CS on the stack * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. + * Xen pv domains are not using the default __KERNEL_CS. */ - if (regs->cs != __KERNEL_CS) + if (!xen_pv_domain() && regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78ca9a8ee454..06fe3d51d385 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -701,7 +701,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, else printk(KERN_CONT "paging request"); - printk(KERN_CONT " at %p\n", (void *) address); + printk(KERN_CONT " at %px\n", (void *) address); printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); @@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6fdf91ef130a..8ca324d07282 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -20,6 +20,7 @@ #include <asm/kaslr.h> #include <asm/hypervisor.h> #include <asm/cpufeature.h> +#include <asm/pti.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -160,6 +161,12 @@ struct map_range { static int page_size_mask; +static void enable_global_pages(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + __supported_pte_mask |= _PAGE_GLOBAL; +} + static void __init probe_page_size_mask(void) { /* @@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ + __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; - } else - __supported_pte_mask &= ~_PAGE_GLOBAL; + enable_global_pages(); + } /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { @@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void) static void setup_pcid(void) { -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_PCID)) { - if (boot_cpu_has(X86_FEATURE_PGE)) { - /* - * This can't be cr4_set_bits_and_update_boot() -- - * the trampoline code can't handle CR4.PCIDE and - * it wouldn't do any good anyway. Despite the name, - * cr4_set_bits_and_update_boot() doesn't actually - * cause the bits in question to remain set all the - * way through the secondary boot asm. - * - * Instead, we brute-force it and set CR4.PCIDE - * manually in start_secondary(). - */ - cr4_set_bits(X86_CR4_PCIDE); - } else { - /* - * flush_tlb_all(), as currently implemented, won't - * work if PCID is on but PGE is not. Since that - * combination doesn't exist on real hardware, there's - * no reason to try to fully support it, but it's - * polite to avoid corrupting data if we're on - * an improperly configured VM. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); - } + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- the + * trampoline code can't handle CR4.PCIDE and it wouldn't + * do any good anyway. Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually cause + * the bits in question to remain set all the way through + * the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE manually in + * start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + + /* + * INVPCID's single-context modes (2/3) only work if we set + * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable + * on systems that have X86_CR4_PCIDE clear, or that have + * no INVPCID support at all. + */ + if (boot_cpu_has(X86_FEATURE_INVPCID)) + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't work if + * PCID is on but PGE is not. Since that combination + * doesn't exist on real hardware, there's no reason to try + * to fully support it, but it's polite to avoid corrupting + * data if we're on an improperly configured VM. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); } -#endif } #ifdef CONFIG_X86_32 @@ -622,6 +639,7 @@ void __init init_mem_mapping(void) { unsigned long end; + pti_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); @@ -845,7 +863,7 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a64a6f2848d..135c9a7898c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,6 +50,7 @@ #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/page_types.h> +#include <asm/cpu_entry_area.h> #include <asm/init.h> #include "mm_internal.h" @@ -766,6 +767,7 @@ void __init mem_init(void) mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif @@ -777,6 +779,10 @@ void __init mem_init(void) FIXADDR_START, FIXADDR_TOP, (FIXADDR_TOP - FIXADDR_START) >> 10, + CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, + CPU_ENTRY_AREA_MAP_SIZE >> 10, + #ifdef CONFIG_HIGHMEM PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, (LAST_PKMAP*PAGE_SIZE) >> 10, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6e4573b1da34..c45b6ec5357b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr) return; } + mmiotrace_iounmap(addr); + addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); - mmiotrace_iounmap(addr); - /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 99dfed6dfef8..47388f0c0e59 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -15,6 +15,7 @@ #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/pgtable.h> +#include <asm/cpu_entry_area.h> extern struct range pfn_mapped[E820_MAX_ENTRIES]; @@ -277,6 +278,7 @@ void __init kasan_early_init(void) void __init kasan_init(void) { int i; + void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; #ifdef CONFIG_KASAN_INLINE register_die_notifier(&kasan_die_notifier); @@ -321,16 +323,33 @@ void __init kasan_init(void) map_range(&pfn_mapped[i]); } + shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, + PAGE_SIZE); + + shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + + CPU_ENTRY_AREA_MAP_SIZE); + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, + PAGE_SIZE); + kasan_populate_zero_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - kasan_mem_to_shadow((void *)__START_KERNEL_map)); + shadow_cpu_entry_begin); + + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, + (unsigned long)shadow_cpu_entry_end, 0); + + kasan_populate_zero_shadow(shadow_cpu_entry_end, + kasan_mem_to_shadow((void *)__START_KERNEL_map)); kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), (unsigned long)kasan_mem_to_shadow(_end), early_pfn_to_nid(__pa(_stext))); kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), - (void *)KASAN_SHADOW_END); + (void *)KASAN_SHADOW_END); load_cr3(init_top_pgt); __flush_tlb_all(); diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/error.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/error.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/pte.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/pte.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c21c2ed04612..58477ec3d66d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned long flags; int ret = 0; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); unsigned int l; pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { + if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; } - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) { ret = -EINVAL; goto out; @@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p) kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { - if (add_kmmio_fault_page(p->addr + size)) + if (add_kmmio_fault_page(addr + size)) pr_err("Unable to set page fault.\n"); size += page_level_size(l); } @@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; unsigned int l; pte_t *pte; - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { - release_kmmio_fault_page(p->addr + size, &release_list); + release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index d9a9e9fc75dd..391b13402e40 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -405,13 +405,13 @@ bool sme_active(void) { return sme_me_mask && !sev_enabled; } -EXPORT_SYMBOL_GPL(sme_active); +EXPORT_SYMBOL(sme_active); bool sev_active(void) { return sme_me_mask && sev_enabled; } -EXPORT_SYMBOL_GPL(sev_active); +EXPORT_SYMBOL(sev_active); static const struct dma_map_ops sev_dma_ops = { .alloc = sev_alloc, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 96d456a94b03..004abf9ebf12 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } #else + static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6b9bf023a700..c3c5274410a9 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/spinlock.h> +#include <asm/cpu_entry_area.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000000..bce8aea65606 --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,387 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This code is based in part on work published here: + * + * https://github.com/IAIK/KAISER + * + * The original work was written by and and signed off by for the Linux + * kernel by: + * + * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> + * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> + * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> + * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> + * + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and + * Andy Lutomirsky <luto@amacapital.net> + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/uaccess.h> + +#include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/vsyscall.h> +#include <asm/cmdline.h> +#include <asm/pti.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +/* Backporting helper */ +#ifndef __GFP_NOTRACK +#define __GFP_NOTRACK 0 +#endif + +static void __init pti_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + pr_info("%s\n", reason); +} + +static void __init pti_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + pr_info("%s\n", reason); +} + +void __init pti_check_boottime_disable(void) +{ + char arg[5]; + int ret; + + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_print_if_insecure("disabled on XEN PV."); + return; + } + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_print_if_insecure("disabled on command line."); + return; + } + if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_print_if_secure("force enabled on command line."); + goto enable; + } + if (ret == 4 && !strncmp(arg, "auto", 4)) + goto autosel; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) { + pti_print_if_insecure("disabled on command line."); + return; + } + +autosel: + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + return; +enable: + setup_force_cpu_cap(X86_FEATURE_PTI); +} + +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Changes to the high (kernel) portion of the kernelmode page + * tables are not automatically propagated to the usermode tables. + * + * Users should keep in mind that, unlike the kernelmode tables, + * there is no vmalloc_fault equivalent for the usermode tables. + * Top-level entries added to init_mm's usermode pgd after boot + * will not be automatically propagated to other mms. + */ + if (!pgdp_maps_userspace(pgdp)) + return pgd; + + /* + * The user page tables get the full PGD, accessible from + * userspace: + */ + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + + /* + * If this is normal user memory, make it NX in the kernel + * pagetables so that, if we somehow screw up and return to + * usermode with the kernel CR3 loaded, we'll get a page fault + * instead of allowing user code to execute with the wrong CR3. + * + * As exceptions, we don't set NX if: + * - _PAGE_USER is not set. This could be an executable + * EFI runtime mapping or something similar, and the kernel + * may execute from it + * - we don't have NX support + * - we're clearing the PGD (i.e. the new pgd is not present). + */ + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && + (__supported_pte_mask & _PAGE_NX)) + pgd.pgd |= _PAGE_NX; + + /* return the copy of the PGD we want the kernel to use: */ + return pgd; +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure. + */ +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (address < PAGE_OFFSET) { + WARN_ONCE(1, "attempt to walk user address\n"); + return NULL; + } + + if (pgd_none(*pgd)) { + unsigned long new_p4d_page = __get_free_page(gfp); + if (!new_p4d_page) + return NULL; + + if (pgd_none(*pgd)) { + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + new_p4d_page = 0; + } + if (new_p4d_page) + free_page(new_p4d_page); + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + p4d_t *p4d = pti_user_pagetable_walk_p4d(address); + pud_t *pud; + + BUILD_BUG_ON(p4d_large(*p4d) != 0); + if (p4d_none(*p4d)) { + unsigned long new_pud_page = __get_free_page(gfp); + if (!new_pud_page) + return NULL; + + if (p4d_none(*p4d)) { + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); + new_pud_page = 0; + } + if (new_pud_page) + free_page(new_pud_page); + } + + pud = pud_offset(p4d, address); + /* The user page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + + if (pud_none(*pud)) { + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + new_pmd_page = 0; + } + if (new_pmd_page) + free_page(new_pmd_page); + } + + return pmd_offset(pud, address); +} + +#ifdef CONFIG_X86_VSYSCALL_EMULATION +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down. Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables. It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + pmd_t *pmd = pti_user_pagetable_walk_pmd(address); + pte_t *pte; + + /* We can't do anything sensible if we hit a large mapping. */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + new_pte_page = 0; + } + if (new_pte_page) + free_page(new_pte_page); + } + + pte = pte_offset_kernel(pmd, address); + if (pte_flags(*pte) & _PAGE_USER) { + WARN_ONCE(1, "attempt to walk to user pte\n"); + return NULL; + } + return pte; +} + +static void __init pti_setup_vsyscall(void) +{ + pte_t *pte, *target_pte; + unsigned int level; + + pte = lookup_address(VSYSCALL_ADDR, &level); + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) + return; + + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + if (WARN_ON(!target_pte)) + return; + + *target_pte = *pte; + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + +static void __init +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +{ + unsigned long addr; + + /* + * Clone the populated PMDs which cover start to end. These PMD areas + * can have holes. + */ + for (addr = start; addr < end; addr += PMD_SIZE) { + pmd_t *pmd, *target_pmd; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset_k(addr); + if (WARN_ON(pgd_none(*pgd))) + return; + p4d = p4d_offset(pgd, addr); + if (WARN_ON(p4d_none(*p4d))) + return; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; + + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = pmd_clear_flags(*pmd, clear); + } +} + +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ + p4d_t *kernel_p4d, *user_p4d; + pgd_t *kernel_pgd; + + user_p4d = pti_user_pagetable_walk_p4d(addr); + kernel_pgd = pgd_offset_k(addr); + kernel_p4d = p4d_offset(kernel_pgd, addr); + *user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA into the user space visible page table. + */ +static void __init pti_clone_user_shared(void) +{ + pti_clone_p4d(CPU_ENTRY_AREA_BASE); +} + +/* + * Clone the ESPFIX P4D into the user space visinble page table + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 + pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + +/* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +static void __init pti_clone_entry_text(void) +{ + pti_clone_pmds((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, _PAGE_RW); +} + +/* + * Initialize kernel page table isolation + */ +void __init pti_init(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("enabled\n"); + + pti_clone_user_shared(); + pti_clone_entry_text(); + pti_setup_espfix64(); + pti_setup_vsyscall(); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3118392cdf75..a1561957dccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,38 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts. We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +void clear_asid_other(void) +{ + u16 asid; + + /* + * This is only expected to be set if we have disabled + * kernel _PAGE_GLOBAL pages. + */ + if (!static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON_ONCE(1); + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + /* Do not need to flush the current asid */ + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) + continue; + /* + * Make sure the next time we go to switch to + * this asid, we do a flush: + */ + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); + } + this_cpu_write(cpu_tlbstate.invalidate_other, false); +} + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, return; } + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != next->context.ctx_id) @@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +{ + unsigned long new_mm_cr3; + + if (need_flush) { + invalidate_user_asid(new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid); + } else { + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + } + + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -128,7 +182,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(build_cr3(next, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, true); /* * NB: This gets called via leave_mm() in the idle path @@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(build_cr3_noflush(next, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, false); /* See above wrt _rcuidle. */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); @@ -288,7 +342,7 @@ void initialize_tlbstate_and_flush(void) !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Force ASID 0 and force a TLB flush. */ - write_cr3(build_cr3(mm, 0)); + write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); @@ -551,7 +605,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_single(addr); + __flush_tlb_one(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index bb461cfd01ab..526536c81ddc 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -97,7 +97,7 @@ static int __init broadcom_postcore_init(void) * We should get host bridge information from ACPI unless the BIOS * doesn't support it. */ - if (acpi_os_get_root_pointer()) + if (!acpi_disabled && acpi_os_get_root_pointer()) return 0; #endif diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 1e996df687a3..e663d6bf1328 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -665,6 +665,16 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) unsigned i; u32 base, limit, high; struct resource *res, *conflict; + struct pci_dev *other; + + /* Check that we are the only device of that type */ + other = pci_get_device(dev->vendor, dev->device, NULL); + if (other != dev || + (other = pci_get_device(dev->vendor, dev->device, other))) { + /* This is a multi-socket system, don't touch it for now */ + pci_dev_put(other); + return; + } for (i = 0; i < 8; i++) { pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base); @@ -696,8 +706,13 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) res->end = 0xfd00000000ull - 1; /* Just grab the free area behind system memory for this */ - while ((conflict = request_resource_conflict(&iomem_resource, res))) + while ((conflict = request_resource_conflict(&iomem_resource, res))) { + if (conflict->end >= res->end) { + kfree(res); + return; + } res->start = conflict->end + 1; + } dev_info(&dev->dev, "adding root bus resource %pR\n", res); @@ -714,10 +729,10 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) pci_bus_add_resource(dev->bus, res, 0); } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); #endif diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6a151ce70e86..d87ac96e37ed 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -196,6 +196,9 @@ static pgd_t *efi_pgd; * because we want to avoid inserting EFI region mappings (EFI_VA_END * to EFI_VA_START) into the standard kernel page tables. Everything * else can be shared, see efi_sync_low_kernel_mappings(). + * + * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the + * allocation. */ int __init efi_alloc_page_tables(void) { @@ -208,7 +211,7 @@ int __init efi_alloc_page_tables(void) return 0; gfp_mask = GFP_KERNEL | __GFP_ZERO; - efi_pgd = (pgd_t *)__get_free_page(gfp_mask); + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); if (!efi_pgd) return -ENOMEM; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index f44c0bc95aa2..8538a6723171 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, local_flush_tlb(); stat->d_alltlb++; } else { - __flush_tlb_one(msg->address); + __flush_tlb_single(msg->address); stat->d_onetlb++; } stat->d_requestee++; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 5f6fd860820a..e4cb9f4cde8a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -128,7 +128,7 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq, * on the specified blade to allow the sending of MSIs to the specified CPU. */ static int uv_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); return 0; diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index c34bd8233f7c..5f64f30873e2 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -905,7 +905,7 @@ static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) /* * UV NMI handler */ -int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) +static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) { struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi; int cpu = smp_processor_id(); @@ -1013,7 +1013,7 @@ void uv_nmi_init(void) } /* Setup HUB NMI info */ -void __init uv_nmi_setup_common(bool hubbed) +static void __init uv_nmi_setup_common(bool hubbed) { int size = sizeof(void *) * (1 << NODES_SHIFT); int cpu; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 84fcfde53f8f..a7d966964c6f 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -82,12 +82,8 @@ static void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ -#ifdef CONFIG_X86_32 store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif + /* * We save it here, but restore it only in the hibernate case. * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit @@ -103,22 +99,18 @@ static void __save_processor_state(struct saved_context *ctxt) /* * segment registers */ -#ifdef CONFIG_X86_32 - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); +#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); +#endif +#ifdef CONFIG_X86_64 + savesegment(gs, ctxt->gs); + savesegment(fs, ctxt->fs); + savesegment(ds, ctxt->ds); + savesegment(es, ctxt->es); rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); rdmsrl(MSR_EFER, ctxt->efer); @@ -160,17 +152,19 @@ static void do_fpu_end(void) static void fix_processor_context(void) { int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif - set_tss_desc(cpu, t); /* - * This just modifies memory; should not be - * necessary. But... This is necessary, because - * 386 hardware has concept of busy TSS or some - * similar stupidity. - */ + + /* + * We need to reload TR, which requires that we change the + * GDT entry to indicate "available" first. + * + * XXX: This could probably all be replaced by a call to + * force_reload_TR(). + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); #ifdef CONFIG_X86_64 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); @@ -178,6 +172,9 @@ static void fix_processor_context(void) write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); syscall_init(); /* This sets MSR_*STAR and related */ +#else + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ @@ -190,9 +187,12 @@ static void fix_processor_context(void) } /** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + * + * The asm code that gets us here will have restored a usable GDT, although + * it will be pointing to the wrong alias. */ static void notrace __restore_processor_state(struct saved_context *ctxt) { @@ -215,46 +215,52 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); + /* Restore the IDT. */ + load_idt(&ctxt->idt); + /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). + * Just in case the asm code got us here with the SS, DS, or ES + * out of sync with the GDT, update them. */ -#ifdef CONFIG_X86_32 - load_idt(&ctxt->idt); + loadsegment(ss, __KERNEL_DS); + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + /* + * Restore percpu access. Percpu access can happen in exception + * handlers or in complicated helpers like load_gs_index(). + */ +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); #else -/* CONFIG_X86_64 */ - load_idt((const struct desc_ptr *)&ctxt->idt_limit); + loadsegment(fs, __KERNEL_PERCPU); + loadsegment(gs, __KERNEL_STACK_CANARY); #endif + /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ + fix_processor_context(); + /* - * segment registers + * Now that we have descriptor tables fully restored and working + * exception handling, restore the usermode segments. */ -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_64 + loadsegment(ds, ctxt->es); loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); + load_gs_index(ctxt->gs); /* - * sysenter MSRs + * Restore FSBASE and GSBASE after restoring the selectors, since + * restoring the selectors clobbers the bases. Keep in mind + * that MSR_KERNEL_GS_BASE is horribly misnamed. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); +#elif defined(CONFIG_X86_32_LAZY_GS) + loadsegment(gs, ctxt->gs); #endif - fix_processor_context(); - do_fpu_end(); tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 6b830d4cb4c8..de58533d3664 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -57,7 +57,7 @@ static u32 xen_apic_read(u32 reg) return 0; if (reg == APIC_LVR) - return 0x10; + return 0x14; #ifdef CONFIG_X86_32 if (reg == APIC_LDR) return SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d669e9d89001..c9081c6671f0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,8 +1,12 @@ +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +#include <linux/bootmem.h> +#endif #include <linux/cpu.h> #include <linux/kexec.h> #include <xen/features.h> #include <xen/page.h> +#include <xen/interface/memory.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -331,3 +335,80 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +void __init arch_xen_balloon_init(struct resource *hostmem_resource) +{ + struct xen_memory_map memmap; + int rc; + unsigned int i, last_guest_ram; + phys_addr_t max_addr = PFN_PHYS(max_pfn); + struct e820_table *xen_e820_table; + const struct e820_entry *entry; + struct resource *res; + + if (!xen_initial_domain()) + return; + + xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL); + if (!xen_e820_table) + return; + + memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries); + set_xen_guest_handle(memmap.buffer, xen_e820_table->entries); + rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap); + if (rc) { + pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc); + goto out; + } + + last_guest_ram = 0; + for (i = 0; i < memmap.nr_entries; i++) { + if (xen_e820_table->entries[i].addr >= max_addr) + break; + if (xen_e820_table->entries[i].type == E820_TYPE_RAM) + last_guest_ram = i; + } + + entry = &xen_e820_table->entries[last_guest_ram]; + if (max_addr >= entry->addr + entry->size) + goto out; /* No unallocated host RAM. */ + + hostmem_resource->start = max_addr; + hostmem_resource->end = entry->addr + entry->size; + + /* + * Mark non-RAM regions between the end of dom0 RAM and end of host RAM + * as unavailable. The rest of that region can be used for hotplug-based + * ballooning. + */ + for (; i < memmap.nr_entries; i++) { + entry = &xen_e820_table->entries[i]; + + if (entry->type == E820_TYPE_RAM) + continue; + + if (entry->addr >= hostmem_resource->end) + break; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + goto out; + + res->name = "Unavailable host RAM"; + res->start = entry->addr; + res->end = (entry->addr + entry->size < hostmem_resource->end) ? + entry->addr + entry->size : hostmem_resource->end; + rc = insert_resource(hostmem_resource, res); + if (rc) { + pr_warn("%s: Can't insert [%llx - %llx) (%d)\n", + __func__, res->start, res->end, rc); + kfree(res); + goto out; + } + } + + out: + kfree(xen_e820_table); +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5b2b3f3f6531..c047f42552e1 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -88,6 +88,8 @@ #include "multicalls.h" #include "pmu.h" +#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ + void *xen_initial_gdt; static int xen_cpu_up_prepare_pv(unsigned int cpu); @@ -622,7 +624,7 @@ static struct trap_array_entry trap_array[] = { { simd_coprocessor_error, xen_simd_coprocessor_error, false }, }; -static bool get_trap_addr(void **addr, unsigned int ist) +static bool __ref get_trap_addr(void **addr, unsigned int ist) { unsigned int nr; bool ist_okay = false; @@ -644,6 +646,14 @@ static bool get_trap_addr(void **addr, unsigned int ist) } } + if (nr == ARRAY_SIZE(trap_array) && + *addr >= (void *)early_idt_handler_array[0] && + *addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) { + nr = (*addr - (void *)early_idt_handler_array[0]) / + EARLY_IDT_HANDLER_SIZE; + *addr = (void *)xen_early_idt_handler_array[nr]; + } + if (WARN_ON(ist != 0 && !ist_okay)) return false; @@ -818,7 +828,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) @@ -1250,6 +1260,7 @@ asmlinkage __visible void __init xen_start_kernel(void) __userpte_alloc_gfp &= ~__GFP_HIGHMEM; /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); x86_configure_nx(); /* Get mfn list */ @@ -1262,6 +1273,21 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_setup_gdt(0); xen_init_irq_ops(); + + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + + /* + * Setup xen_vcpu early because idt_setup_early_handler needs it for + * local_irq_disable(), irqs_disabled(). + * + * Don't do the full vcpu_info placement stuff until we have + * the cpu_possible_mask and a non-dummy shared_info. + */ + xen_vcpu_info_reset(0); + + idt_setup_early_handler(); + xen_init_capabilities(); #ifdef CONFIG_X86_LOCAL_APIC @@ -1295,18 +1321,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif - /* Let's presume PV guests always boot on vCPU with id 0. */ - per_cpu(xen_vcpu_id, 0) = 0; - - /* - * Setup xen_vcpu early because start_kernel needs it for - * local_irq_disable(), irqs_disabled(). - * - * Don't do the full vcpu_info placement stuff until we have - * the cpu_possible_mask and a non-dummy shared_info. - */ - xen_vcpu_info_reset(0); - WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index fc048ec686e7..4d62c071b166 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1902,6 +1902,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* + * Zap execute permission from the ident map. Due to the sharing of + * L1 entries we need to do this in the L2. + */ + if (__supported_pte_mask & _PAGE_NX) { + for (i = 0; i < PTRS_PER_PMD; ++i) { + if (pmd_none(level2_ident_pgt[i])) + continue; + level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX); + } + } + /* Copy the initial P->M table mappings if necessary. */ i = pgd_index(xen_start_info->mfn_list); if (i && i < pgd_index(__START_KERNEL_map)) @@ -2261,7 +2273,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) switch (idx) { case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: - case FIX_RO_IDT: #ifdef CONFIG_X86_32 case FIX_WP_TEST: # ifdef CONFIG_HIGHMEM @@ -2272,7 +2283,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: - case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: /* All local page mappings */ pte = pfn_pte(phys, prot); break; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c114ca767b3b..6e0d2086eacb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -808,7 +808,6 @@ char * __init xen_memory_setup(void) addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; while (i < xen_e820_table.nr_entries) { - bool discard = false; chunk_size = size; type = xen_e820_table.entries[i].type; @@ -824,11 +823,10 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - discard = true; + type = E820_TYPE_UNUSABLE; } - if (!discard) - xen_align_and_add_e820_region(addr, chunk_size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 8a10c9a9e2b5..417b339e5c8e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,7 @@ #include <xen/interface/xen.h> +#include <linux/init.h> #include <linux/linkage.h> .macro xen_pv_trap name @@ -54,6 +55,19 @@ xen_pv_trap entry_INT80_compat #endif xen_pv_trap hypervisor_callback + __INIT +ENTRY(xen_early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS + pop %rcx + pop %r11 + jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE + i = i + 1 + .fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +END(xen_early_idt_handler_array) + __FINIT + hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* * Xen64 iret frame: |