author		Thomas Gleixner <tglx@linutronix.de>	2020-06-11 16:17:57 +0300
committer	Thomas Gleixner <tglx@linutronix.de>	2020-06-11 16:17:57 +0300
commit		f77d26a9fc525286bcef3d4f98b52e17482cf49c (patch)
tree		6b179c9aa84787773cb601a14a64255e2912154b /arch/x86
parent		b6bea24d41519e8c31e4798f1c1a3f67e540c5d0 (diff)
parent		f0178fc01fe46bab6a95415f5647d1a74efcad1b (diff)
download	linux-f77d26a9fc525286bcef3d4f98b52e17482cf49c.tar.xz
Merge branch 'x86/entry' into ras/core
to fix up conflicts in arch/x86/kernel/cpu/mce/core.c, so that MCE-specific
follow-up patches can be applied without creating a horrible merge conflict
afterwards.
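For reference, the merge recorded here corresponds to roughly the following
sequence (a sketch reconstructed from the commit message; the exact
invocation and branch setup are assumed):

	git checkout ras/core
	git merge x86/entry	# resolve the conflict in arch/x86/kernel/cpu/mce/core.c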
Diffstat (limited to 'arch/x86')
374 files changed, 9341 insertions, 7500 deletions
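The diffstat above can be reproduced locally against the first parent (a
sketch; it assumes a clone of the upstream tree containing both parent
commit IDs listed above):

	git diff --stat b6bea24d4151 f77d26a9fc52 -- arch/x86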
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d6104ea8af0..a16c45460f1b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -59,7 +59,9 @@ config X86
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_HAS_ACPI_TABLE_UPGRADE	if ACPI
 	select ARCH_HAS_DEBUG_VIRTUAL
+	select ARCH_HAS_DEBUG_VM_PGTABLE	if !X86_PAE
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
+	select ARCH_HAS_EARLY_DEBUG		if KGDB
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FILTER_PGPROT
@@ -68,6 +70,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEM_ENCRYPT
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
+	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
@@ -80,6 +83,7 @@ config X86
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
+	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -91,6 +95,7 @@ config X86
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_USE_SYM_ANNOTATIONS
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_DEFAULT_BPF_JIT	if X86_64
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
@@ -149,7 +154,7 @@ config X86
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
-	select HAVE_ARCH_USERFAULTFD_WP		if USERFAULTFD
+	select HAVE_ARCH_USERFAULTFD_WP		if X86_64 && USERFAULTFD
 	select HAVE_ARCH_VMAP_STACK		if X86_64
 	select HAVE_ARCH_WITHIN_STACK_FRAMES
 	select HAVE_ASM_MODVERSIONS
@@ -176,7 +181,6 @@ config X86
 	select HAVE_HW_BREAKPOINT
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
-	select HAVE_IRQ_EXIT_ON_IRQ_STACK	if X86_64
 	select HAVE_IRQ_TIME_ACCOUNTING
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_GZIP
@@ -190,7 +194,6 @@ config X86
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
-	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
 	select HAVE_MOD_ARCH_SPECIFIC
 	select HAVE_MOVE_PMD
@@ -595,7 +598,7 @@ config X86_INTEL_MID
 	select I2C
 	select DW_APB_TIMER
 	select APB_TIMER
-	select INTEL_SCU_IPC
+	select INTEL_SCU_PCI
 	select MFD_INTEL_MSIC
 	---help---
 	  Select to build a kernel capable of supporting Intel MID (Mobile
@@ -1520,6 +1523,7 @@ config X86_CPA_STATISTICS
 config AMD_MEM_ENCRYPT
 	bool "AMD Secure Memory Encryption (SME) support"
 	depends on X86_64 && CPU_SUP_AMD
+	select DMA_COHERENT_POOL
 	select DYNAMIC_PHYSICAL_MASK
 	select ARCH_USE_MEMREMAP_PROT
 	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
@@ -1582,15 +1586,6 @@ config X86_64_ACPI_NUMA
 	---help---
 	  Enable ACPI SRAT based node topology detection.
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
-	def_bool y
-	depends on X86_64_ACPI_NUMA
-
 config NUMA_EMU
 	bool "NUMA emulation"
 	depends on NUMA
@@ -1610,19 +1605,10 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
-config ARCH_HAVE_MEMORY_PRESENT
-	def_bool y
-	depends on X86_32 && DISCONTIGMEM
-
 config ARCH_FLATMEM_ENABLE
 	def_bool y
 	depends on X86_32 && !NUMA
 
-config ARCH_DISCONTIGMEM_ENABLE
-	def_bool n
-	depends on NUMA && X86_32
-	depends on BROKEN
-
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
 	depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD
@@ -1887,10 +1873,10 @@ config X86_UMIP
 	  results are dummy.
 
 config X86_INTEL_MEMORY_PROTECTION_KEYS
-	prompt "Intel Memory Protection Keys"
+	prompt "Memory Protection Keys"
 	def_bool y
 	# Note: only available in 64-bit mode
-	depends on CPU_SUP_INTEL && X86_64
+	depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD)
 	select ARCH_USES_HIGH_VMA_FLAGS
 	select ARCH_HAS_PKEYS
 	---help---
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 13de0db38d4e..26b8c08e2fc4 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -15,3 +15,7 @@ config AS_SHA256_NI
 	def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1)
 	help
 	  Supported by binutils >= 2.24 and LLVM integrated assembler
+config AS_TPAUSE
+	def_bool $(as-instr,tpause %ecx)
+	help
+	  Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 2e74690b028a..fdf1431ac8c2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,42 +72,6 @@ config EFI_PGT_DUMP
 	  issues with the mapping of the EFI runtime regions into that
 	  table.
 
-config DEBUG_WX
-	bool "Warn on W+X mappings at boot"
-	select PTDUMP_CORE
-	---help---
-	  Generate a warning if any W+X mappings are found at boot.
-
-	  This is useful for discovering cases where the kernel is leaving
-	  W+X mappings after applying NX, as such mappings are a security risk.
-
-	  Look for a message in dmesg output like this:
-
-	    x86/mm: Checked W+X mappings: passed, no W+X pages found.
-
-	  or like this, if the check failed:
-
-	    x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
-
-	  Note that even if the check fails, your kernel is possibly
-	  still fine, as W+X mappings are not a security hole in
-	  themselves, what they do is that they make the exploitation
-	  of other unfixed kernel bugs easier.
-
-	  There is no runtime or memory usage effect of this option
-	  once the kernel has booted up - it's a one time check.
-
-	  If in doubt, say "Y".
-
-config DOUBLEFAULT
-	default y
-	bool "Enable doublefault exception handler" if EXPERT && X86_32
-	---help---
-	  This option allows trapping of rare doublefault exceptions that
-	  would otherwise cause a system to silently reboot. Disabling this
-	  option saves about 4k and might cause you much additional grey
-	  hair.
-
 config DEBUG_TLBFLUSH
 	bool "Set upper limit of TLB entries to flush one-by-one"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b65ec63c7db7..00e378de8bc0 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -246,7 +246,7 @@ drivers-$(CONFIG_FB) += arch/x86/video/
 
 boot := arch/x86/boot
 
-BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
+BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 isoimage
 
 PHONY += bzImage $(BOOT_TARGETS)
 
@@ -267,8 +267,8 @@ endif
 $(BOOT_TARGETS): vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $@
 
-PHONY += install
-install:
+PHONY += install bzlilo
+install bzlilo:
 	$(Q)$(MAKE) $(build)=$(boot) $@
 
 PHONY += vdso_install
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index e17be90ab312..4c5355684321 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -57,11 +57,10 @@ $(obj)/cpu.o: $(obj)/cpustr.h
 
 quiet_cmd_cpustr = CPUSTR  $@
       cmd_cpustr = $(obj)/mkcpustr > $@
-targets += cpustr.h
 $(obj)/cpustr.h: $(obj)/mkcpustr FORCE
 	$(call if_changed,cpustr)
 endif
-clean-files += cpustr.h
+targets += cpustr.h
 
 # ---------------------------------------------------------------------------
 
@@ -129,6 +128,8 @@ quiet_cmd_genimage = GENIMAGE $3
       cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
 			$(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD)
 
+PHONY += bzdisk fdimage fdimage144 fdimage288 isoimage bzlilo install
+
 # This requires write access to /dev/fd0
 bzdisk: $(obj)/bzImage $(obj)/mtools.conf
 	$(call cmd,genimage,bzdisk,/dev/fd0)
@@ -146,7 +147,7 @@ isoimage: $(obj)/bzImage
 	$(call cmd,genimage,isoimage,$(obj)/image.iso)
 	@$(kecho) 'Kernel: $(obj)/image.iso is ready'
 
-bzlilo: $(obj)/bzImage
+bzlilo:
 	if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
 	if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
 	cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index ef2ad7253cd5..8bcbcee54aa1 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -280,9 +280,9 @@ acpi_physical_address get_rsdp_addr(void)
  */
 #define MAX_ADDR_LEN 19
 
-static acpi_physical_address get_cmdline_acpi_rsdp(void)
+static unsigned long get_cmdline_acpi_rsdp(void)
 {
-	acpi_physical_address addr = 0;
+	unsigned long addr = 0;
 
 #ifdef CONFIG_KEXEC
 	char val[MAX_ADDR_LEN] = { };
@@ -292,7 +292,7 @@ static acpi_physical_address get_cmdline_acpi_rsdp(void)
 	if (ret < 0)
 		return 0;
 
-	if (kstrtoull(val, 16, &addr))
+	if (boot_kstrtoul(val, 16, &addr))
 		return 0;
 #endif
 	return addr;
@@ -314,7 +314,6 @@ static unsigned long get_acpi_srat_table(void)
 	 * different ideas about whether to trust a command-line parameter.
 	 */
 	rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp();
-
 	if (!rsdp)
 		rsdp = (struct acpi_table_rsdp *)(long)
 			boot_params->acpi_rsdp_addr;
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
index 2b2049259619..c4bb0f9363f5 100644
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -28,8 +28,6 @@ SYM_FUNC_START(__efi64_thunk)
 	push	%rbx
 
 	leaq	1f(%rip), %rbp
-	leaq	efi_gdt64(%rip), %rbx
-	movl	%ebx, 2(%rbx)		/* Fixup the gdt base address */
 
 	movl	%ds, %eax
 	push	%rax
@@ -48,7 +46,8 @@ SYM_FUNC_START(__efi64_thunk)
 	movl	%r8d, 0xc(%rsp)
 	movl	%r9d, 0x10(%rsp)
 
-	sgdt	0x14(%rsp)
+	leaq	0x14(%rsp), %rbx
+	sgdt	(%rbx)
 
 	/*
	 * Switch to gdt with 32-bit segments. This is the firmware GDT
@@ -68,8 +67,7 @@ SYM_FUNC_START(__efi64_thunk)
 	pushq	%rax
 	lretq
 
-1:	lgdt	0x14(%rsp)
-	addq	$32, %rsp
+1:	addq	$32, %rsp
 	movq	%rdi, %rax
 
 	pop	%rbx
@@ -175,14 +173,3 @@ SYM_DATA_END(efi32_boot_cs)
 SYM_DATA_START(efi32_boot_ds)
 	.word	0
 SYM_DATA_END(efi32_boot_ds)
-
-SYM_DATA_START(efi_gdt64)
-	.word	efi_gdt64_end - efi_gdt64
-	.long	0			/* Filled out by user */
-	.word	0
-	.quad	0x0000000000000000	/* NULL descriptor */
-	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
-	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
-	.quad	0x0080890000000000	/* TS descriptor */
-	.quad	0x0000000000000000	/* TS continued */
-SYM_DATA_END_LABEL(efi_gdt64, SYM_L_LOCAL, efi_gdt64_end)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ab3307036ba4..03557f2174bf 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -49,16 +49,17 @@
 * Position Independent Executable (PIE) so that linker won't optimize
 * R_386_GOT32X relocation to its fixed symbol address.  Older
 * linkers generate R_386_32 relocations against locally defined symbols,
- * _bss, _ebss, _got and _egot, in PIE.  It isn't wrong, just less
+ * _bss, _ebss, _got, _egot and _end, in PIE.  It isn't wrong, just less
 * optimal than R_386_RELATIVE.  But the x86 kernel fails to properly handle
 * R_386_32 relocations when relocating the kernel.  To generate
- * R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as
+ * R_386_RELATIVE relocations, we mark _bss, _ebss, _got, _egot and _end as
 * hidden:
 */
	.hidden _bss
	.hidden _ebss
	.hidden _got
	.hidden _egot
+	.hidden _end
 
	__HEAD
 SYM_FUNC_START(startup_32)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 4f7e6b84be07..e821a7d7d5c4 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -42,6 +42,7 @@
	.hidden _ebss
	.hidden _got
	.hidden _egot
+	.hidden _end
 
	__HEAD
	.code32
@@ -393,6 +394,14 @@ SYM_CODE_START(startup_64)
	addq	%rax, 2(%rax)
	lgdt	(%rax)
 
+	/* Reload CS so IRET returns to a CS actually in the GDT */
+	pushq	$__KERNEL_CS
+	leaq	.Lon_kernel_cs(%rip), %rax
+	pushq	%rax
+	lretq
+
+.Lon_kernel_cs:
+
	/*
	 * paging_prepare() sets up the trampoline and checks if we need to
	 * enable 5-level paging.
diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
index 9557c5a15b91..f9c5c13d979b 100644
--- a/arch/x86/boot/compressed/kaslr_64.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -22,8 +22,8 @@
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
+#include <linux/pgtable.h>
 #include <asm/init.h>
-#include <asm/pgtable.h>
 /* Use the static base for this part of the boot process */
 #undef __PAGE_OFFSET
 #define __PAGE_OFFSET __PAGE_OFFSET_BASE
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 508cfa6828c5..8f1025d1f681 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -52,6 +52,7 @@ SECTIONS
		_data = . ;
		*(.data)
		*(.data.*)
+		*(.bss.efistub)
		_edata = . ;
	}
	. = ALIGN(L1_CACHE_BYTES);
@@ -73,4 +74,6 @@ SECTIONS
 #endif
	. = ALIGN(PAGE_SIZE);	/* keep ZO size page aligned */
	_end = .;
+
+	DISCARDS
 }
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 8272a4492844..8a3fff9128bb 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -117,7 +117,6 @@ static unsigned int simple_guess_base(const char *cp)
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 */
-
 unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
 {
	unsigned long long result = 0;
@@ -335,3 +334,45 @@ int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
		s++;
	return _kstrtoull(s, base, res);
 }
+
+static int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+	unsigned long long tmp;
+	int rv;
+
+	rv = kstrtoull(s, base, &tmp);
+	if (rv < 0)
+		return rv;
+	if (tmp != (unsigned long)tmp)
+		return -ERANGE;
+	*res = tmp;
+	return 0;
+}
+
+/**
+ * kstrtoul - convert a string to an unsigned long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ *  include a single newline before its terminating null. The first character
+ *  may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ *  given as 0, then the base of the string is automatically detected with the
+ *  conventional semantics - If it begins with 0x the number will be parsed as a
+ *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ *  parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the simple_strtoull.
+ */
+int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res)
+{
+	/*
+	 * We want to shortcut function call, but
+	 * __builtin_types_compatible_p(unsigned long, unsigned long long) = 0.
+	 */
+	if (sizeof(unsigned long) == sizeof(unsigned long long) &&
+	    __alignof__(unsigned long) == __alignof__(unsigned long long))
+		return kstrtoull(s, base, (unsigned long long *)res);
+	else
+		return _kstrtoul(s, base, res);
+}
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index 38d8f2f5e47e..995f7b7ad512 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -30,4 +30,5 @@ extern unsigned long long simple_strtoull(const char *cp, char **endp,
					  unsigned int base);
 
 int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
+int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res);
 #endif /* BOOT_STRING_H */
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 8f8c8e386cea..c8b8c1a8d1fc 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -59,14 +59,14 @@ u8 buf[SETUP_SECT_MAX*512];
 #define PECOFF_COMPAT_RESERVE 0x0
 #endif
 
-unsigned long efi32_stub_entry;
-unsigned long efi64_stub_entry;
-unsigned long efi_pe_entry;
-unsigned long efi32_pe_entry;
-unsigned long kernel_info;
-unsigned long startup_64;
-unsigned long _ehead;
-unsigned long _end;
+static unsigned long efi32_stub_entry;
+static unsigned long efi64_stub_entry;
+static unsigned long efi_pe_entry;
+static unsigned long efi32_pe_entry;
+static unsigned long kernel_info;
+static unsigned long startup_64;
+static unsigned long _ehead;
+static unsigned long _end;
 
 /*----------------------------------------------------------------------*/
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index cad6e1bfa7d5..54e7d15dbd0d 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2758,7 +2758,7 @@ SYM_FUNC_START(aesni_xts_crypt8)
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)
 
-	CALL_NOSPEC %r11
+	CALL_NOSPEC r11
 
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
@@ -2803,7 +2803,7 @@ SYM_FUNC_START(aesni_xts_crypt8)
	_aesni_gf128mul_x_ble()
	movups IV, (IVP)
 
-	CALL_NOSPEC %r11
+	CALL_NOSPEC r11
 
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 06ef2d4a4701..6737bcea1fa1 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2s_state *state,
			   const u32 inc)
 {
	/* SIMD disables preemption, so relax after processing each page. */
-	BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
+	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
 
	if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
		blake2s_compress_generic(state, block, nblocks, inc);
		return;
	}
 
-	for (;;) {
+	do {
		const size_t blocks = min_t(size_t, nblocks,
-					    PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
+					    SZ_4K / BLAKE2S_BLOCK_SIZE);
 
		kernel_fpu_begin();
		if (IS_ENABLED(CONFIG_AS_AVX512) &&
@@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2s_state *state,
		kernel_fpu_end();
 
		nblocks -= blocks;
-		if (!nblocks)
-			break;
		block += blocks * BLAKE2S_BLOCK_SIZE;
-	}
+	} while (nblocks);
 }
 EXPORT_SYMBOL(blake2s_compress_arch);
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index d01ddd73de65..ecc0a9a905c4 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -1228,7 +1228,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way)
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;
 
-	CALL_NOSPEC %r9;
+	CALL_NOSPEC r9;
 
	addq $(16 * 16), %rsp;
 
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 563ef6e83cdd..0907243c501c 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -1339,7 +1339,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
-	CALL_NOSPEC %r9;
+	CALL_NOSPEC r9;
 
	addq $(16 * 32), %rsp;
 
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index b412c21ee06e..22250091cdbe 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -153,9 +153,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
	    bytes <= CHACHA_BLOCK_SIZE)
		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
 
-	kernel_fpu_begin();
-	chacha_dosimd(state, dst, src, bytes, nrounds);
-	kernel_fpu_end();
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_fpu_begin();
+		chacha_dosimd(state, dst, src, todo, nrounds);
+		kernel_fpu_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
 }
 EXPORT_SYMBOL(chacha_crypt_arch);
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 0e6690e3618c..8501ec4532f4 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -75,7 +75,7 @@
 
 .text
 SYM_FUNC_START(crc_pcl)
-#define    bufp		%rdi
+#define    bufp		rdi
 #define    bufp_dw	%edi
 #define    bufp_w	%di
 #define    bufp_b	%dil
@@ -105,9 +105,9 @@ SYM_FUNC_START(crc_pcl)
	## 1) ALIGN:
	################################################################
 
-	mov     bufp, bufptmp		# rdi = *buf
-	neg     bufp
-	and     $7, bufp		# calculate the unalignment amount of
+	mov     %bufp, bufptmp		# rdi = *buf
+	neg     %bufp
+	and     $7, %bufp		# calculate the unalignment amount of
					# the address
	je      proc_block		# Skip if aligned
 
@@ -123,13 +123,13 @@ SYM_FUNC_START(crc_pcl)
 do_align:
	#### Calculate CRC of unaligned bytes of the buffer (if any)
	movq    (bufptmp), tmp		# load a quadword from the buffer
-	add     bufp, bufptmp		# align buffer pointer for quadword
+	add     %bufp, bufptmp		# align buffer pointer for quadword
					# processing
-	sub     bufp, len		# update buffer length
+	sub     %bufp, len		# update buffer length
 align_loop:
	crc32b  %bl, crc_init_dw	# compute crc32 of 1-byte
	shr     $8, tmp			# get next byte
-	dec     bufp
+	dec     %bufp
	jne     align_loop
 
 proc_block:
@@ -169,10 +169,10 @@ continue_block:
	xor     crc2, crc2
 
	## branch into array
-	lea	jump_table(%rip), bufp
-	movzxw  (bufp, %rax, 2), len
-	lea	crc_array(%rip), bufp
-	lea     (bufp, len, 1), bufp
+	lea	jump_table(%rip), %bufp
+	movzxw  (%bufp, %rax, 2), len
+	lea	crc_array(%rip), %bufp
+	lea     (%bufp, len, 1), %bufp
	JMP_NOSPEC bufp
 
	################################################################
@@ -218,9 +218,9 @@ LABEL crc_ %i
	## 4) Combine three results:
	################################################################
 
-	lea	(K_table-8)(%rip), bufp		# first entry is for idx 1
+	lea	(K_table-8)(%rip), %bufp	# first entry is for idx 1
	shlq    $3, %rax			# rax *= 8
-	pmovzxdq (bufp,%rax), %xmm0		# 2 consts: K1:K2
+	pmovzxdq (%bufp,%rax), %xmm0		# 2 consts: K1:K2
	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
	subq    %rax, tmp			# tmp -= rax*24
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index f7567cbd35b6..80fcb85736e1 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -29,7 +29,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
		return crypto_nhpoly1305_update(desc, src, srclen);
 
	do {
-		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+		unsigned int n = min_t(unsigned int, srclen, SZ_4K);
 
		kernel_fpu_begin();
		crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index a661ede3b5cf..cc6b7c1a2705 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -29,7 +29,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
		return crypto_nhpoly1305_update(desc, src, srclen);
 
	do {
-		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+		unsigned int n = min_t(unsigned int, srclen, SZ_4K);
 
		kernel_fpu_begin();
		crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 6dfec19f7d57..dfe921efa9b2 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
	struct poly1305_arch_internal *state = ctx;
 
	/* SIMD disables preemption, so relax after processing each page. */
-	BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
-		     PAGE_SIZE % POLY1305_BLOCK_SIZE);
+	BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+		     SZ_4K % POLY1305_BLOCK_SIZE);
 
	if (!static_branch_likely(&poly1305_use_avx) ||
	    (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
@@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
		return;
	}
 
-	for (;;) {
-		const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+	do {
+		const size_t bytes = min_t(size_t, len, SZ_4K);
 
		kernel_fpu_begin();
		if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
@@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
		else
			poly1305_blocks_avx(ctx, inp, bytes, padbit);
		kernel_fpu_end();
+
		len -= bytes;
-		if (!len)
-			break;
		inp += bytes;
-	}
+	} while (len);
 }
 
 static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index a801ffc10cbb..18200135603f 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -21,7 +21,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha1_base.h>
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 6394b5fe8db6..dd06249229e1 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -34,7 +34,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha256_base.h>
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 82cc1b3ced1d..b0b05c93409e 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/cryptohash.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 85eb381259c2..b7a5790d8d63 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -3,7 +3,13 @@
 # Makefile for the x86 low level entry code
 #
 
-OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
 
 CFLAGS_syscall_64.o		+= $(call cc-option,-Wno-override-init,)
 CFLAGS_syscall_32.o		+= $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 0789e13ece90..4208c1e3f601 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -98,13 +98,6 @@ For 32-bit we have the following conventions - kernel is built with
 #define SIZEOF_PTREGS	21*8
 
 .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
-	/*
-	 * Push registers and sanitize registers of values that a
-	 * speculation attack might otherwise want to exploit. The
-	 * lower registers are likely clobbered well before they
-	 * could be put to use in a speculative execution gadget.
-	 * Interleave XOR with PUSH for better uop scheduling:
-	 */
	.if \save_ret
	pushq	%rsi		/* pt_regs->si */
	movq	8(%rsp), %rsi	/* temporarily store the return address in %rsi */
@@ -114,34 +107,43 @@ For 32-bit we have the following conventions - kernel is built with
	pushq   %rsi		/* pt_regs->si */
	.endif
	pushq	\rdx		/* pt_regs->dx */
-	xorl	%edx, %edx	/* nospec   dx */
	pushq   %rcx		/* pt_regs->cx */
-	xorl	%ecx, %ecx	/* nospec   cx */
	pushq   \rax		/* pt_regs->ax */
	pushq   %r8		/* pt_regs->r8 */
-	xorl	%r8d, %r8d	/* nospec   r8 */
	pushq   %r9		/* pt_regs->r9 */
-	xorl	%r9d, %r9d	/* nospec   r9 */
	pushq   %r10		/* pt_regs->r10 */
-	xorl	%r10d, %r10d	/* nospec   r10 */
	pushq   %r11		/* pt_regs->r11 */
-	xorl	%r11d, %r11d	/* nospec   r11*/
	pushq	%rbx		/* pt_regs->rbx */
-	xorl    %ebx, %ebx	/* nospec   rbx*/
	pushq	%rbp		/* pt_regs->rbp */
-	xorl    %ebp, %ebp	/* nospec   rbp*/
	pushq	%r12		/* pt_regs->r12 */
-	xorl	%r12d, %r12d	/* nospec   r12*/
	pushq	%r13		/* pt_regs->r13 */
-	xorl	%r13d, %r13d	/* nospec   r13*/
	pushq	%r14		/* pt_regs->r14 */
-	xorl	%r14d, %r14d	/* nospec   r14*/
	pushq	%r15		/* pt_regs->r15 */
-	xorl	%r15d, %r15d	/* nospec   r15*/
	UNWIND_HINT_REGS
+
	.if \save_ret
	pushq	%rsi		/* return address on top of stack */
	.endif
+
+	/*
+	 * Sanitize registers of values that a speculation attack might
+	 * otherwise want to exploit. The lower registers are likely clobbered
+	 * well before they could be put to use in a speculative execution
+	 * gadget.
+	 */
+	xorl	%edx,  %edx	/* nospec dx  */
+	xorl	%ecx,  %ecx	/* nospec cx  */
+	xorl	%r8d,  %r8d	/* nospec r8  */
+	xorl	%r9d,  %r9d	/* nospec r9  */
+	xorl	%r10d, %r10d	/* nospec r10 */
+	xorl	%r11d, %r11d	/* nospec r11 */
+	xorl	%ebx,  %ebx	/* nospec rbx */
+	xorl	%ebp,  %ebp	/* nospec rbp */
+	xorl	%r12d, %r12d	/* nospec r12 */
+	xorl	%r13d, %r13d	/* nospec r13 */
+	xorl	%r14d, %r14d	/* nospec r14 */
+	xorl	%r15d, %r15d	/* nospec r15 */
+
 .endm
 
 .macro POP_REGS pop_rdi=1 skip_r11rcx=0
@@ -339,30 +341,13 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
-#endif /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */
 
 .macro STACKLEAK_ERASE
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	call stackleak_erase
 #endif
 .endm
-
-/*
- * This does 'call enter_from_user_mode' unless we can avoid it based on
- * kernel config or using the static jump infrastructure.
- */
-.macro CALL_enter_from_user_mode
-#ifdef CONFIG_CONTEXT_TRACKING
-#ifdef CONFIG_JUMP_LABEL
-	STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0
-#endif
-	call enter_from_user_mode
-.Lafter_call_\@:
-#endif
-.endm
-
-#ifdef CONFIG_PARAVIRT_XXL
-#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
-#else
-#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
-#endif
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 76735ec813e6..f4d57782c14b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,11 @@
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
 
+#ifdef CONFIG_XEN_PV
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#endif
+
 #include <asm/desc.h>
 #include <asm/traps.h>
 #include <asm/vdso.h>
@@ -35,21 +40,67 @@
 #include <asm/nospec-branch.h>
 #include <asm/io_bitmap.h>
 #include <asm/syscall.h>
+#include <asm/irq_stack.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
 #ifdef CONFIG_CONTEXT_TRACKING
-/* Called on entry from user mode with IRQs off. */
-__visible inline void enter_from_user_mode(void)
+/**
+ * enter_from_user_mode - Establish state when coming from user mode
+ *
+ * Syscall entry disables interrupts, but user mode is traced as interrupts
+ * enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ */
+static noinstr void enter_from_user_mode(void)
 {
-	CT_WARN_ON(ct_state() != CONTEXT_USER);
+	enum ctx_state state = ct_state();
+
+	lockdep_hardirqs_off(CALLER_ADDR0);
	user_exit_irqoff();
+
+	instrumentation_begin();
+	CT_WARN_ON(state != CONTEXT_USER);
+	trace_hardirqs_off_finish();
+	instrumentation_end();
 }
 #else
-static inline void enter_from_user_mode(void) {}
+static __always_inline void enter_from_user_mode(void)
+{
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	instrumentation_begin();
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
 #endif
 
+/**
+ * exit_to_user_mode - Fixup state when exiting to user mode
+ *
+ * Syscall exit enables interrupts, but the kernel state is interrupts
+ * disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Clear CPU buffers if CPU is affected by MDS and the mitigation is on.
+ * 4) Tell lockdep that interrupts are enabled
+ */
+static __always_inline void exit_to_user_mode(void)
+{
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	instrumentation_end();
+
+	user_enter_irqoff();
+	mds_user_clear_cpu_buffers();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
 static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
 {
 #ifdef CONFIG_X86_64
@@ -179,8 +230,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
	}
 }
 
-/* Called with IRQs disabled. */
-__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+static void __prepare_exit_to_usermode(struct pt_regs *regs)
 {
	struct thread_info *ti = current_thread_info();
	u32 cached_flags;
@@ -219,10 +269,14 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
	 */
	ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
 #endif
+}
 
-	user_enter_irqoff();
-
-	mds_user_clear_cpu_buffers();
+__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+	instrumentation_begin();
+	__prepare_exit_to_usermode(regs);
+	instrumentation_end();
+	exit_to_user_mode();
 }
 
 #define SYSCALL_EXIT_WORK_FLAGS				\
@@ -251,11 +305,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
		tracehook_report_syscall_exit(regs, step);
 }
 
-/*
- * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
- * state such that we can immediately switch to user mode.
- */
-__visible inline void syscall_return_slowpath(struct pt_regs *regs)
+static void __syscall_return_slowpath(struct pt_regs *regs)
 {
	struct thread_info *ti = current_thread_info();
	u32 cached_flags = READ_ONCE(ti->flags);
@@ -276,15 +326,29 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
		syscall_slow_exit_work(regs, cached_flags);
 
	local_irq_disable();
-	prepare_exit_to_usermode(regs);
+	__prepare_exit_to_usermode(regs);
+}
+
+/*
+ * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
+{
+	instrumentation_begin();
+	__syscall_return_slowpath(regs);
+	instrumentation_end();
+	exit_to_user_mode();
 }
 
 #ifdef CONFIG_X86_64
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 {
	struct thread_info *ti;
 
	enter_from_user_mode();
+	instrumentation_begin();
+
	local_irq_enable();
	ti = current_thread_info();
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
@@ -301,8 +365,10 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
		regs->ax = x32_sys_call_table[nr](regs);
 #endif
	}
+	__syscall_return_slowpath(regs);
 
-	syscall_return_slowpath(regs);
+	instrumentation_end();
+	exit_to_user_mode();
 }
 #endif
 
@@ -313,7 +379,7 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 * extremely hot in workloads that use it, and it's usually called from
 * do_fast_syscall_32, so forcibly inline it to improve performance.
 */
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static void do_syscall_32_irqs_on(struct pt_regs *regs)
 {
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;
@@ -337,27 +403,62 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
		regs->ax = ia32_sys_call_table[nr](regs);
	}
 
-	syscall_return_slowpath(regs);
+	__syscall_return_slowpath(regs);
 }
 
 /* Handles int $0x80 */
-__visible void do_int80_syscall_32(struct pt_regs *regs)
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
 {
	enter_from_user_mode();
+	instrumentation_begin();
+
	local_irq_enable();
	do_syscall_32_irqs_on(regs);
+
+	instrumentation_end();
+	exit_to_user_mode();
+}
+
+static bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+	int res;
+
+	/* Fetch EBP from where the vDSO stashed it. */
+	if (IS_ENABLED(CONFIG_X86_64)) {
+		/*
+		 * Micro-optimization: the pointer we're following is
+		 * explicitly 32 bits, so it can't be out of range.
+		 */
+		res = __get_user(*(u32 *)&regs->bp,
+			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
+	} else {
+		res = get_user(*(u32 *)&regs->bp,
+		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
+	}
+
+	if (res) {
+		/* User code screwed up. */
+		regs->ax = -EFAULT;
+		local_irq_disable();
+		__prepare_exit_to_usermode(regs);
+		return false;
+	}
+
+	/* Now this is just like a normal syscall. */
+	do_syscall_32_irqs_on(regs);
+	return true;
 }
 
 /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible long do_fast_syscall_32(struct pt_regs *regs)
+__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
 {
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
-		vdso_image_32.sym_int80_landing_pad;
+			vdso_image_32.sym_int80_landing_pad;
+	bool success;
 
	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
@@ -367,33 +468,17 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
	regs->ip = landing_pad;
 
	enter_from_user_mode();
+	instrumentation_begin();
 
	local_irq_enable();
+	success = __do_fast_syscall_32(regs);
 
-	/* Fetch EBP from where the vDSO stashed it. */
-	if (
-#ifdef CONFIG_X86_64
-	/*
-	 * Micro-optimization: the pointer we're following is explicitly
-	 * 32 bits, so it can't be out of range.
-	 */
-		__get_user(*(u32 *)&regs->bp,
-			(u32 __user __force *)(unsigned long)(u32)regs->sp)
-#else
-		get_user(*(u32 *)&regs->bp,
-			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
-#endif
-	) {
-
-		/* User code screwed up. */
-		local_irq_disable();
-		regs->ax = -EFAULT;
-		prepare_exit_to_usermode(regs);
-		return 0;	/* Keep it simple: use IRET. */
-	}
+	instrumentation_end();
+	exit_to_user_mode();
 
-	/* Now this is just like a normal syscall. */
-	do_syscall_32_irqs_on(regs);
+	/* If it failed, keep it simple: use IRET. */
+	if (!success)
+		return 0;
 
 #ifdef CONFIG_X86_64
	/*
@@ -431,3 +516,245 @@ SYSCALL_DEFINE0(ni_syscall)
 {
	return -ENOSYS;
 }
+
+/**
+ * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
+ *			     RCU handling
+ * @regs:	Pointer to pt_regs of interrupted context
+ *
+ * Invokes:
+ *  - lockdep irqflag state tracking as low level ASM entry disabled
+ *    interrupts.
+ *
+ *  - Context tracking if the exception hit user mode.
+ *
+ *  - The hardirq tracer to keep the state consistent as low level ASM
+ *    entry disabled interrupts.
+ *
+ * For kernel mode entries RCU handling is done conditional. If RCU is
+ * watching then the only RCU requirement is to check whether the tick has
+ * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
+ * invoked on entry and rcu_irq_exit() on exit.
+ *
+ * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
+ * solves the problem of kernel mode pagefaults which can schedule, which
+ * is not possible after invoking rcu_irq_enter() without undoing it.
+ *
+ * For user mode entries enter_from_user_mode() must be invoked to
+ * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
+ * would not be possible.
+ *
+ * Returns: True if RCU has been adjusted on a kernel entry
+ *	    False otherwise
+ *
+ * The return value must be fed into the rcu_exit argument of
+ * idtentry_exit_cond_rcu().
+ */
+bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
+{
+	if (user_mode(regs)) {
+		enter_from_user_mode();
+		return false;
+	}
+
+	if (!__rcu_is_watching()) {
+		/*
+		 * If RCU is not watching then the same careful
+		 * sequence vs. lockdep and tracing is required
+		 * as in enter_from_user_mode().
+		 *
+		 * This only happens for IRQs that hit the idle
+		 * loop, i.e. if idle is not using MWAIT.
+		 */
+		lockdep_hardirqs_off(CALLER_ADDR0);
+		rcu_irq_enter();
+		instrumentation_begin();
+		trace_hardirqs_off_finish();
+		instrumentation_end();
+
+		return true;
+	}
+
+	/*
+	 * If RCU is watching then RCU only wants to check
+	 * whether it needs to restart the tick in NOHZ
+	 * mode.
+	 */
+	instrumentation_begin();
+	rcu_irq_enter_check_tick();
+	/* Use the combo lockdep/tracing function */
+	trace_hardirqs_off();
+	instrumentation_end();
+
+	return false;
+}
+
+static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
+{
+	if (may_sched && !preempt_count()) {
+		/* Sanity check RCU and thread stack */
+		rcu_irq_exit_check_preempt();
+		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+			WARN_ON_ONCE(!on_thread_stack());
+		if (need_resched())
+			preempt_schedule_irq();
+	}
+	/* Covers both tracing and lockdep */
+	trace_hardirqs_on();
+}
+
+/**
+ * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
+ *			    handling
+ * @regs:	Pointer to pt_regs (exception entry regs)
+ * @rcu_exit:	Invoke rcu_irq_exit() if true
+ *
+ * Depending on the return target (kernel/user) this runs the necessary
+ * preemption and work checks if possible and required and returns to
+ * the caller with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
+ * function must be fed into the @rcu_exit argument.
+ */
+void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
+{
+	lockdep_assert_irqs_disabled();
+
+	/* Check whether this returns to user mode */
+	if (user_mode(regs)) {
+		prepare_exit_to_usermode(regs);
+	} else if (regs->flags & X86_EFLAGS_IF) {
+		/*
+		 * If RCU was not watching on entry this needs to be done
+		 * carefully and needs the same ordering of lockdep/tracing
+		 * and RCU as the return to user mode path.
+		 */
+		if (rcu_exit) {
+			instrumentation_begin();
+			/* Tell the tracer that IRET will enable interrupts */
+			trace_hardirqs_on_prepare();
+			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+			instrumentation_end();
+			rcu_irq_exit();
+			lockdep_hardirqs_on(CALLER_ADDR0);
+			return;
+		}
+
+		instrumentation_begin();
+		idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION));
+		instrumentation_end();
+	} else {
+		/*
+		 * IRQ flags state is correct already. Just tell RCU if it
+		 * was not watching on entry.
+		 */
+		if (rcu_exit)
+			rcu_irq_exit();
+	}
+}
+
+/**
+ * idtentry_enter_user - Handle state tracking on idtentry from user mode
+ * @regs:	Pointer to pt_regs of interrupted context
+ *
+ * Invokes enter_from_user_mode() to establish the proper context for
+ * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
+ */
+void noinstr idtentry_enter_user(struct pt_regs *regs)
+{
+	enter_from_user_mode();
+}
+
+/**
+ * idtentry_exit_user - Handle return from exception to user mode
+ * @regs:	Pointer to pt_regs (exception entry regs)
+ *
+ * Runs the necessary preemption and work checks and returns to the caller
+ * with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to idtentry_enter_user().
+ */
+void noinstr idtentry_exit_user(struct pt_regs *regs)
+{
+	lockdep_assert_irqs_disabled();
+
+	prepare_exit_to_usermode(regs);
+}
+
+#ifdef CONFIG_XEN_PV
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+	__this_cpu_write(xen_in_preemptible_hcall, false);
+	return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+#else
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+#endif
+
+static void __xen_pv_evtchn_do_upcall(void)
+{
+	irq_enter_rcu();
+	inc_irq_stat(irq_hv_callback_count);
+
+	xen_hvm_evtchn_do_upcall();
+
+	irq_exit_rcu();
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs;
+	bool inhcall, rcu_exit;
+
+	rcu_exit = idtentry_enter_cond_rcu(regs);
+	old_regs = set_irq_regs(regs);
+
+	instrumentation_begin();
+	run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs);
+	instrumentation_end();
+
+	set_irq_regs(old_regs);
+
+	inhcall = get_and_clear_inhcall();
+	if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
+		instrumentation_begin();
+		idtentry_exit_cond_resched(regs, true);
+		instrumentation_end();
+		restore_inhcall(inhcall);
+	} else {
+		idtentry_exit_cond_rcu(regs, rcu_exit);
+	}
+}
+#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b67bae7091d7..024d7d276cd4 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,40 +44,13 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/frame.h>
+#include <asm/trapnr.h>
 #include <asm/nospec-branch.h>
 
 #include "calling.h"
 
	.section .entry.text, "ax"
 
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization.  The following will never clobber any registers:
- *   INTERRUPT_RETURN (aka. "iret")
- *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#ifdef CONFIG_PREEMPTION
-# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-# define preempt_stop(clobbers)
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
-	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
-	jz	1f
-	TRACE_IRQS_ON
-1:
-#endif
-.endm
-
 #define PTI_SWITCH_MASK         (1 << PAGE_SHIFT)
 
 /*
@@ -726,10 +699,68 @@
 .Lend_\@:
 .endm
 
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector:		Vector number
+ * @asmsym:		ASM symbol for the entry point
+ * @cfunc:		C function to be called
+ * @has_error_code:	Hardware pushed error code on stack
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+	ASM_CLAC
+	cld
+
+	.if \has_error_code == 0
+	pushl	$0		/* Clear the error code */
+	.endif
+
+	/* Push the C-function address into the GS slot */
+	pushl	$\cfunc
+	/* Invoke the common exception entry */
+	jmp	handle_exception
+SYM_CODE_END(\asmsym)
+.endm
+
+.macro idtentry_irq vector cfunc
+	.p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+	ASM_CLAC
+	SAVE_ALL switch_stacks=1
+	ENCODE_FRAME_POINTER
+	movl	%esp, %eax
+	movl	PT_ORIG_EAX(%esp), %edx		/* get the vector from stack */
+	movl	$-1, PT_ORIG_EAX(%esp)		/* no syscall to restart */
+	call	\cfunc
+	jmp	handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
+.macro idtentry_sysvec vector cfunc
+	idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+	.align 16
+	.globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+	.align 16
+	.globl __irqentry_text_end
+__irqentry_text_end:
+
 /*
 * %eax: prev task
 * %edx: next task
 */
+.pushsection .text, "ax"
 SYM_CODE_START(__switch_to_asm)
	/*
	 * Save callee-saved registers
@@ -776,6 +807,7 @@ SYM_CODE_START(__switch_to_asm)
	jmp	__switch_to
 SYM_CODE_END(__switch_to_asm)
+.popsection
 
 /*
 * The unwinder expects the last frame on the stack to always be at the same
@@ -784,6 +816,7 @@ SYM_CODE_END(__switch_to_asm)
 * asmlinkage function so its argument has to be pushed on the stack.  This
 * wrapper creates a proper "end of stack" frame header before the call.
 */
+.pushsection .text, "ax"
 SYM_FUNC_START(schedule_tail_wrapper)
	FRAME_BEGIN
 
@@ -794,6 +827,8 @@ SYM_FUNC_START(schedule_tail_wrapper)
	FRAME_END
	ret
 SYM_FUNC_END(schedule_tail_wrapper)
+.popsection
+
 /*
 * A newly forked process directly context switches into this address.
 *
@@ -801,6 +836,7 @@ SYM_FUNC_END(schedule_tail_wrapper)
 * ebx: kernel thread func (NULL for user thread)
 * edi: kernel thread arg
 */
+.pushsection .text, "ax"
 SYM_CODE_START(ret_from_fork)
	call	schedule_tail_wrapper
 
@@ -811,12 +847,11 @@ SYM_CODE_START(ret_from_fork)
	/* When we fork, we trace the syscall return in the child, too. */
	movl    %esp, %eax
	call    syscall_return_slowpath
-	STACKLEAK_ERASE
-	jmp     restore_all
+	jmp     .Lsyscall_32_done
 
	/* kernel thread */
 1:	movl	%edi, %eax
-	CALL_NOSPEC %ebx
+	CALL_NOSPEC ebx
	/*
	 * A kernel thread is allowed to return here after successfully
	 * calling do_execve().  Exit to userspace to complete the execve()
@@ -825,38 +860,7 @@ SYM_CODE_START(ret_from_fork)
	movl	$0, PT_EAX(%esp)
	jmp	2b
 SYM_CODE_END(ret_from_fork)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
-	# userspace resumption stub bypassing syscall exit tracing
-SYM_CODE_START_LOCAL(ret_from_exception)
-	preempt_stop(CLBR_ANY)
-ret_from_intr:
-#ifdef CONFIG_VM86
-	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
-	movb	PT_CS(%esp), %al
-	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
-	/*
-	 * We can be coming here from child spawned by kernel_thread().
-	 */
-	movl	PT_CS(%esp), %eax
-	andl	$SEGMENT_RPL_MASK, %eax
-#endif
-	cmpl	$USER_RPL, %eax
-	jb	restore_all_kernel		# not returning to v8086 or userspace
-
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	movl	%esp, %eax
-	call	prepare_exit_to_usermode
-	jmp	restore_all
-SYM_CODE_END(ret_from_exception)
+.popsection
 
 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
 /*
@@ -960,12 +964,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
	jnz	.Lsysenter_fix_flags
 .Lsysenter_flags_fixed:
 
-	/*
-	 * User mode is traced as though IRQs are on, and SYSENTER
-	 * turned them off.
-	 */
-	TRACE_IRQS_OFF
-
	movl	%esp, %eax
	call	do_fast_syscall_32
	/* XEN PV guests always use IRET path */
@@ -974,8 +972,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
	STACKLEAK_ERASE
 
-/* Opportunistic SYSEXIT */
-	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
+	/* Opportunistic SYSEXIT */
 
	/*
	 * Setup entry stack - we keep the pointer in %eax and do the
@@ -1075,20 +1072,12 @@ SYM_FUNC_START(entry_INT80_32)
 
	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */
 
-	/*
-	 * User mode is traced as though IRQs are on, and the interrupt gate
-	 * turned them off.
-	 */
-	TRACE_IRQS_OFF
-
	movl	%esp, %eax
	call	do_int80_syscall_32
 .Lsyscall_32_done:
-
	STACKLEAK_ERASE
 
-restore_all:
-	TRACE_IRQS_ON
+restore_all_switch_stack:
	SWITCH_TO_ENTRY_STACK
	CHECK_AND_APPLY_ESPFIX
@@ -1107,26 +1096,10 @@ restore_all:
	 */
	INTERRUPT_RETURN
 
-restore_all_kernel:
-#ifdef CONFIG_PREEMPTION
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	cmpl	$0, PER_CPU_VAR(__preempt_count)
-	jnz	.Lno_preempt
-	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
-	jz	.Lno_preempt
-	call	preempt_schedule_irq
-.Lno_preempt:
-#endif
-	TRACE_IRQS_IRET
-	PARANOID_EXIT_TO_KERNEL_MODE
-	BUG_IF_WRONG_CR3
-	RESTORE_REGS 4
-	jmp	.Lirq_return
-
 .section .fixup, "ax"
-SYM_CODE_START(iret_exc)
+SYM_CODE_START(asm_iret_error)
	pushl	$0				# no error code
-	pushl	$do_iret_error
+	pushl	$iret_error
 
 #ifdef CONFIG_DEBUG_ENTRY
	/*
@@ -1140,10 +1113,10 @@ SYM_CODE_START(iret_exc)
	popl	%eax
 #endif
 
-	jmp	common_exception
-SYM_CODE_END(iret_exc)
+	jmp	handle_exception
+SYM_CODE_END(asm_iret_error)
 .previous
-	_ASM_EXTABLE(.Lirq_return, iret_exc)
+	_ASM_EXTABLE(.Lirq_return, asm_iret_error)
 SYM_FUNC_END(entry_INT80_32)
 
 .macro FIXUP_ESPFIX_STACK
@@ -1193,192 +1166,21 @@ SYM_FUNC_END(entry_INT80_32)
 #endif
 .endm
 
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
-	.align 8
-SYM_CODE_START(irq_entries_start)
-    vector=FIRST_EXTERNAL_VECTOR
-    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
-	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
-    vector=vector+1
-	jmp	common_interrupt
-	.align	8
-    .endr
-SYM_CODE_END(irq_entries_start)
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	.align 8
-SYM_CODE_START(spurious_entries_start)
-    vector=FIRST_SYSTEM_VECTOR
-    .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
-	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
-    vector=vector+1
-	jmp	common_spurious
-	.align	8
-    .endr
-SYM_CODE_END(spurious_entries_start)
-
-SYM_CODE_START_LOCAL(common_spurious)
-	ASM_CLAC
-	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
-	SAVE_ALL switch_stacks=1
-	ENCODE_FRAME_POINTER
-	TRACE_IRQS_OFF
-	movl	%esp, %eax
-	call	smp_spurious_interrupt
-	jmp	ret_from_intr
-SYM_CODE_END(common_spurious)
-#endif
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
-	.p2align CONFIG_X86_L1_CACHE_SHIFT
-SYM_CODE_START_LOCAL(common_interrupt)
-	ASM_CLAC
-	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
-
-	SAVE_ALL switch_stacks=1
-	ENCODE_FRAME_POINTER
-	TRACE_IRQS_OFF
-	movl	%esp, %eax
-	call	do_IRQ
-	jmp	ret_from_intr
-SYM_CODE_END(common_interrupt)
-
-#define BUILD_INTERRUPT3(name, nr, fn)			\
-SYM_FUNC_START(name)					\
-	ASM_CLAC;					\
-	pushl	$~(nr);					\
-	SAVE_ALL switch_stacks=1;			\
-	ENCODE_FRAME_POINTER;				\
-	TRACE_IRQS_OFF					\
-	movl	%esp, %eax;				\
-	call	fn;					\
-	jmp	ret_from_intr;				\
-SYM_FUNC_END(name)
-
-#define BUILD_INTERRUPT(name, nr)		\
-	BUILD_INTERRUPT3(name, nr, smp_##name);	\
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-SYM_CODE_START(coprocessor_error)
-	ASM_CLAC
-	pushl	$0
-	pushl	$do_coprocessor_error
-	jmp	common_exception
-SYM_CODE_END(coprocessor_error)
-
-SYM_CODE_START(simd_coprocessor_error)
-	ASM_CLAC
-	pushl	$0
-#ifdef CONFIG_X86_INVD_BUG
-	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-	ALTERNATIVE "pushl	$do_general_protection",	\
-		    "pushl	$do_simd_coprocessor_error",	\
-		    X86_FEATURE_XMM
-#else
-	pushl	$do_simd_coprocessor_error
-#endif
-	jmp	common_exception
-SYM_CODE_END(simd_coprocessor_error)
-
-SYM_CODE_START(device_not_available)
-	ASM_CLAC
-	pushl	$0
-	pushl	$do_device_not_available
-	jmp	common_exception
-SYM_CODE_END(device_not_available)
-
 #ifdef CONFIG_PARAVIRT
 SYM_CODE_START(native_iret)
	iret
-	_ASM_EXTABLE(native_iret, iret_exc)
+	_ASM_EXTABLE(native_iret, asm_iret_error)
 SYM_CODE_END(native_iret)
 #endif
 
-SYM_CODE_START(overflow)
-	ASM_CLAC
-	pushl	$0
-	pushl	$do_overflow
-	jmp	common_exception
-SYM_CODE_END(overflow)
-
-SYM_CODE_START(bounds)
-	ASM_CLAC
-	pushl	$0
-	pushl	$do_bounds
-	jmp	common_exception
-SYM_CODE_END(bounds)
-
-SYM_CODE_START(invalid_op)
-	ASM_CLAC
-	pushl	$0
-	pushl	$do_invalid_op
-	jmp	common_exception
-SYM_CODE_END(invalid_op)
-
-SYM_CODE_START(coprocessor_segment_overrun)
-	ASM_CLAC
-	pushl	$0
-	pushl	$do_coprocessor_segment_overrun
-	jmp	common_exception
-SYM_CODE_END(coprocessor_segment_overrun)
-
-SYM_CODE_START(invalid_TSS)
-	ASM_CLAC
-	pushl	$do_invalid_TSS
-	jmp	common_exception
-SYM_CODE_END(invalid_TSS)
-
-SYM_CODE_START(segment_not_present)
-	ASM_CLAC
-	pushl	$do_segment_not_present
-	jmp	common_exception
-SYM_CODE_END(segment_not_present)
-
-SYM_CODE_START(stack_segment)
-	ASM_CLAC
-	pushl	$do_stack_segment
-	jmp	common_exception
-SYM_CODE_END(stack_segment)
-
-SYM_CODE_START(alignment_check) - ASM_CLAC - pushl $do_alignment_check - jmp common_exception -SYM_CODE_END(alignment_check) - -SYM_CODE_START(divide_error) - ASM_CLAC - pushl $0 # no error code - pushl $do_divide_error - jmp common_exception -SYM_CODE_END(divide_error) - -#ifdef CONFIG_X86_MCE -SYM_CODE_START(machine_check) - ASM_CLAC - pushl $0 - pushl $do_mce - jmp common_exception -SYM_CODE_END(machine_check) -#endif - -SYM_CODE_START(spurious_interrupt_bug) - ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp common_exception -SYM_CODE_END(spurious_interrupt_bug) - #ifdef CONFIG_XEN_PV -SYM_FUNC_START(xen_hypervisor_callback) +/* + * See comment in entry_64.S for further explanation + * + * Note: This is not an actual IDT entry point. It's a XEN specific entry + * point and therefore named to match the 64-bit trampoline counterpart. + */ +SYM_FUNC_START(xen_asm_exc_xen_hypervisor_callback) /* * Check to see if we got the event in the critical * region in xen_iret_direct, after we've reenabled @@ -1395,14 +1197,11 @@ SYM_FUNC_START(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL ENCODE_FRAME_POINTER - TRACE_IRQS_OFF + mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPTION - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr -SYM_FUNC_END(xen_hypervisor_callback) + call xen_pv_evtchn_do_upcall + jmp handle_exception_return +SYM_FUNC_END(xen_asm_exc_xen_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1429,11 +1228,11 @@ SYM_FUNC_START(xen_failsafe_callback) popl %eax lea 16(%esp), %esp jz 5f - jmp iret_exc + jmp asm_iret_error 5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL ENCODE_FRAME_POINTER - jmp ret_from_exception + jmp handle_exception_return .section .fixup, "ax" 6: xorl %eax, %eax @@ -1456,56 +1255,7 @@ SYM_FUNC_START(xen_failsafe_callback) SYM_FUNC_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ -#ifdef CONFIG_XEN_PVHVM -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) -#endif - - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, - hyperv_reenlightenment_intr) - -BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, - hv_stimer0_vector_handler) - -#endif /* CONFIG_HYPERV */ - -SYM_CODE_START(page_fault) - ASM_CLAC - pushl $do_page_fault - jmp common_exception_read_cr2 -SYM_CODE_END(page_fault) - -SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) - /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 - - ENCODE_FRAME_POINTER - - /* fixup %gs */ - GS_TO_REG %ecx - movl PT_GS(%esp), %edi - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - - GET_CR2_INTO(%ecx) # might clobber %eax - - /* fixup orig %eax */ - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - - TRACE_IRQS_OFF - movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi - jmp ret_from_exception -SYM_CODE_END(common_exception_read_cr2) - -SYM_CODE_START_LOCAL_NOALIGN(common_exception) +SYM_CODE_START_LOCAL_NOALIGN(handle_exception) /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER @@ -1520,24 +1270,35 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception) movl PT_ORIG_EAX(%esp), %edx # get the error 
code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi - jmp ret_from_exception -SYM_CODE_END(common_exception) + CALL_NOSPEC edi -SYM_CODE_START(debug) +handle_exception_return: +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else /* - * Entry from sysenter is now handled in common_exception + * We can be coming here from child spawned by kernel_thread(). */ - ASM_CLAC - pushl $0 - pushl $do_debug - jmp common_exception -SYM_CODE_END(debug) + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax # returning to v8086 or userspace ? + jnb ret_to_user -#ifdef CONFIG_DOUBLEFAULT -SYM_CODE_START(double_fault) + PARANOID_EXIT_TO_KERNEL_MODE + BUG_IF_WRONG_CR3 + RESTORE_REGS 4 + jmp .Lirq_return + +ret_to_user: + movl %esp, %eax + jmp restore_all_switch_stack +SYM_CODE_END(handle_exception) + +SYM_CODE_START(asm_exc_double_fault) 1: /* * This is a task gate handler, not an interrupt gate handler. @@ -1575,8 +1336,7 @@ SYM_CODE_START(double_fault) 1: hlt jmp 1b -SYM_CODE_END(double_fault) -#endif +SYM_CODE_END(asm_exc_double_fault) /* * NMI is doubly nasty. It can happen on the first instruction of @@ -1585,7 +1345,7 @@ SYM_CODE_END(double_fault) * switched stacks. We handle both conditions by simply checking whether we * interrupted kernel code running on the SYSENTER stack. */ -SYM_CODE_START(nmi) +SYM_CODE_START(asm_exc_nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 @@ -1614,7 +1374,7 @@ SYM_CODE_START(nmi) jb .Lnmi_from_sysenter_stack /* Not on SYSENTER stack. */ - call do_nmi + call exc_nmi jmp .Lnmi_return .Lnmi_from_sysenter_stack: @@ -1624,7 +1384,7 @@ SYM_CODE_START(nmi) */ movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp - call do_nmi + call exc_nmi movl %ebx, %esp .Lnmi_return: @@ -1678,29 +1438,9 @@ SYM_CODE_START(nmi) lss (1+5+6)*4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif -SYM_CODE_END(nmi) - -SYM_CODE_START(int3) - ASM_CLAC - pushl $0 - pushl $do_int3 - jmp common_exception -SYM_CODE_END(int3) - -SYM_CODE_START(general_protection) - ASM_CLAC - pushl $do_general_protection - jmp common_exception -SYM_CODE_END(general_protection) - -#ifdef CONFIG_KVM_GUEST -SYM_CODE_START(async_page_fault) - ASM_CLAC - pushl $do_async_page_fault - jmp common_exception_read_cr2 -SYM_CODE_END(async_page_fault) -#endif +SYM_CODE_END(asm_exc_nmi) +.pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp @@ -1711,3 +1451,4 @@ SYM_CODE_START(rewind_stack_do_exit) call do_exit 1: jmp 1b SYM_CODE_END(rewind_stack_do_exit) +.popsection diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0e9504fabe52..d2a00c97e53f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -16,7 +16,6 @@ * * Some macro usage: * - SYM_FUNC_START/END:Define functions in the symbol table. - * - TRACE_IRQ_*: Trace hardirq state for lock debugging. * - idtentry: Define exception entry points. 
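handle_exception_return above decides between the kernel and user exit paths by mixing EFLAGS.VM into the CS RPL test when VM86 is configured. A hedged C rendering of that test (pt_regs field and constant names as used elsewhere in arch/x86; the assembly is authoritative):

/* Sketch of the CS/EFLAGS check in handle_exception_return. */
static bool returning_to_user(const struct pt_regs *regs)
{
#ifdef CONFIG_VM86
	/* v8086 mode sets X86_EFLAGS_VM and always returns to user */
	if (regs->flags & X86_EFLAGS_VM)
		return true;
#endif
	/* RPL of the saved CS: USER_RPL (3) means userspace */
	return (regs->cs & SEGMENT_RPL_MASK) >= USER_RPL;
}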
*/ #include <linux/linkage.h> @@ -37,6 +36,7 @@ #include <asm/pgtable_types.h> #include <asm/export.h> #include <asm/frame.h> +#include <asm/trapnr.h> #include <asm/nospec-branch.h> #include <linux/err.h> @@ -53,57 +53,6 @@ SYM_CODE_START(native_usergs_sysret64) SYM_CODE_END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_FLAGS flags:req -#ifdef CONFIG_TRACE_IRQFLAGS - btl $9, \flags /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -.macro TRACE_IRQS_IRETQ - TRACE_IRQS_FLAGS EFLAGS(%rsp) -.endm - -/* - * When dynamic function tracer is enabled it will add a breakpoint - * to all locations that it is about to modify, sync CPUs, update - * all the code, sync CPUs, then remove the breakpoints. In this time - * if lockdep is enabled, it might jump back into the debug handler - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). - * - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to - * make sure the stack pointer does not get reset back to the top - * of the debug stack, and instead just reuses the current stack. - */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) - -.macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero - TRACE_IRQS_OFF - call debug_stack_reset -.endm - -.macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero - TRACE_IRQS_ON - call debug_stack_reset -.endm - -.macro TRACE_IRQS_IRETQ_DEBUG - btl $9, EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON_DEBUG -1: -.endm - -#else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ -#endif - /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -144,11 +93,6 @@ SYM_CODE_END(native_usergs_sysret64) SYM_CODE_START(entry_SYSCALL_64) UNWIND_HINT_EMPTY - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ swapgs /* tss.sp2 is scratch space. */ @@ -167,15 +111,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) PUSH_AND_CLEAR_REGS rax=$-ENOSYS - TRACE_IRQS_OFF - /* IRQs are off. */ movq %rax, %rdi movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ - TRACE_IRQS_ON /* return enables interrupts */ - /* * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. If we're not, @@ -249,7 +189,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - UNWIND_HINT_EMPTY POP_REGS pop_rdi=0 skip_r11rcx=1 /* @@ -258,6 +197,7 @@ syscall_return_via_sysret: */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -279,8 +219,8 @@ SYM_CODE_END(entry_SYSCALL_64) * %rdi: prev task * %rsi: next task */ -SYM_CODE_START(__switch_to_asm) - UNWIND_HINT_FUNC +.pushsection .text, "ax" +SYM_FUNC_START(__switch_to_asm) /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -321,7 +261,8 @@ SYM_CODE_START(__switch_to_asm) popq %rbp jmp __switch_to -SYM_CODE_END(__switch_to_asm) +SYM_FUNC_END(__switch_to_asm) +.popsection /* * A newly forked process directly context switches into this address. 
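For orientation, the two movq instructions ahead of the call in entry_SYSCALL_64 simply marshal the System V argument registers; the C entry point this stub pairs with has this shape (signature per arch/x86/entry/common.c at this point in the series):

/* entry_SYSCALL_64 hands off:  %rax -> %rdi (nr),  %rsp -> %rsi (regs) */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs);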
@@ -330,6 +271,7 @@ SYM_CODE_END(__switch_to_asm) * rbx: kernel thread func (NULL for user thread) * r12: kernel thread arg */ +.pushsection .text, "ax" SYM_CODE_START(ret_from_fork) UNWIND_HINT_EMPTY movq %rax, %rdi @@ -342,14 +284,13 @@ SYM_CODE_START(ret_from_fork) UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ - TRACE_IRQS_ON /* user mode is traced as IRQS on */ jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ UNWIND_HINT_EMPTY movq %r12, %rdi - CALL_NOSPEC %rbx + CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -358,34 +299,7 @@ SYM_CODE_START(ret_from_fork) movq $0, RAX(%rsp) jmp 2b SYM_CODE_END(ret_from_fork) - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -SYM_CODE_START(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_interrupt - .align 8 - vector=vector+1 - .endr -SYM_CODE_END(irq_entries_start) - - .align 8 -SYM_CODE_START(spurious_entries_start) - vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_spurious - .align 8 - vector=vector+1 - .endr -SYM_CODE_END(spurious_entries_start) +.popsection .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY @@ -399,228 +313,185 @@ SYM_CODE_END(spurious_entries_start) #endif .endm -/* - * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers - * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. - * Requires kernel GSBASE. - * - * The invariant is that, if irq_count != -1, then the IRQ stack is in use. +/** + * idtentry_body - Macro to emit code calling the C function + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack */ -.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0 - DEBUG_ENTRY_ASSERT_IRQS_OFF - - .if \save_ret - /* - * If save_ret is set, the original stack contains one additional - * entry -- the return address. Therefore, move the address one - * entry below %rsp to \old_rsp. - */ - leaq 8(%rsp), \old_rsp - .else - movq %rsp, \old_rsp - .endif +.macro idtentry_body cfunc has_error_code:req - .if \regs - UNWIND_HINT_REGS base=\old_rsp - .endif + call error_entry + UNWIND_HINT_REGS - incl PER_CPU_VAR(irq_count) - jnz .Lirq_stack_push_old_rsp_\@ + movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ - /* - * Right now, if we just incremented irq_count to zero, we've - * claimed the IRQ stack but we haven't switched to it yet. - * - * If anything is added that can interrupt us here without using IST, - * it must be *extremely* careful to limit its stack usage. This - * could include kprobes and a hypothetical future IST-less #DB - * handler. - * - * The OOPS unwinder relies on the word at the top of the IRQ - * stack linking back to the previous RSP for the entire time we're - * on the IRQ stack. For this to work reliably, we need to write - * it before we actually move ourselves to the IRQ stack. 
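idtentry_body hands the C function a pt_regs pointer in %rdi and, when the CPU pushed one, the error code in %rsi. The matching C-side contracts come from <asm/idtentry.h>; a hedged sketch of the two shapes this macro targets:

/* has_error_code=0: emitted via DEFINE_IDTENTRY(exc_divide_error) */
__visible void exc_divide_error(struct pt_regs *regs);

/* has_error_code=1: emitted via DEFINE_IDTENTRY_ERRORCODE(...) */
__visible void exc_general_protection(struct pt_regs *regs,
				      unsigned long error_code);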
- */ + .if \has_error_code == 1 + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .endif - movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(hardirq_stack_ptr), %rsp + call \cfunc -#ifdef CONFIG_DEBUG_ENTRY - /* - * If the first movq above becomes wrong due to IRQ stack layout - * changes, the only way we'll notice is if we try to unwind right - * here. Assert that we set up the stack right to catch this type - * of bug quickly. - */ - cmpq -8(%rsp), \old_rsp - je .Lirq_stack_okay\@ - ud2 - .Lirq_stack_okay\@: -#endif + jmp error_return +.endm -.Lirq_stack_push_old_rsp_\@: - pushq \old_rsp +/** + * idtentry - Macro to generate entry stubs for simple IDT entries + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack + * + * The macro emits code to set up the kernel context for straight forward + * and simple IDT entries. No IST stack, no paranoid entry checks. + */ +.macro idtentry vector asmsym cfunc has_error_code:req +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + ASM_CLAC - .if \regs - UNWIND_HINT_REGS indirect=1 + .if \has_error_code == 0 + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif - .if \save_ret - /* - * Push the return address to the stack. This return address can - * be found at the "real" original RSP, which was offset by 8 at - * the beginning of this macro. - */ - pushq -8(\old_rsp) + .if \vector == X86_TRAP_BP + /* + * If coming from kernel space, create a 6-word gap to allow the + * int3 handler to emulate a call instruction. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_\@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_\@: .endif + + idtentry_body \cfunc \has_error_code + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) .endm /* - * Undoes ENTER_IRQ_STACK. + * Interrupt entry/exit. + * + + The interrupt stubs push (vector) onto the stack, which is the error_code + * position of idtentry exceptions, and jump to one of the two idtentry points + * (common/spurious). + * + * common_interrupt is a hotpath, align it to a cache line */ -.macro LEAVE_IRQ_STACK regs=1 - DEBUG_ENTRY_ASSERT_IRQS_OFF - /* We need to be off the IRQ stack before decrementing irq_count. */ - popq %rsp - - .if \regs - UNWIND_HINT_REGS - .endif - - /* - * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming - * the irq stack but we're not on it. - */ - - decl PER_CPU_VAR(irq_count) +.macro idtentry_irq vector cfunc + .p2align CONFIG_X86_L1_CACHE_SHIFT + idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm /* - * Interrupt entry helper function. + * System vectors which invoke their handlers directly and are not + * going through the regular common device interrupt handling code. + */ +.macro idtentry_sysvec vector cfunc + idtentry \vector asm_\cfunc \cfunc has_error_code=0 +.endm + +/** + * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * + * The macro emits code to set up the kernel context for #MC and #DB + * + * If the entry comes from user space it uses the normal entry path + * including the return to user space work and preemption checks on + * exit. * - * Entry runs with interrupts off. 
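The six-word gap above exists so the #BP handler can emulate a call instruction: emulation has to push a return address below the trap frame, and without the reserved slots that push would scribble over whatever lived under the hardware IRET frame. The text-patching helpers that depend on the gap look roughly like this (simplified from <asm/text-patching.h>):

static __always_inline
void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
	/* lands in the gap the stub reserved */
	regs->sp -= sizeof(unsigned long);
	*(unsigned long *)regs->sp = val;
}

static __always_inline
void int3_emulate_call(struct pt_regs *regs, unsigned long func)
{
	/* return address: past the int3, as if a 5-byte call ran */
	int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
	regs->ip = func;
}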
Stack layout at entry: - * +----------------------------------------------------+ - * | regs->ss | - * | regs->rsp | - * | regs->eflags | - * | regs->cs | - * | regs->ip | - * +----------------------------------------------------+ - * | regs->orig_ax = ~(interrupt number) | - * +----------------------------------------------------+ - * | return address | - * +----------------------------------------------------+ + * If hits in kernel mode then it needs to go through the paranoid + * entry as the exception can hit any random state. No preemption + * check on exit to keep the paranoid path simple. */ -SYM_CODE_START(interrupt_entry) - UNWIND_HINT_FUNC +.macro idtentry_mce_db vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS ASM_CLAC - cld - testb $3, CS-ORIG_RAX+8(%rsp) - jz 1f - SWAPGS - FENCE_SWAPGS_USER_ENTRY + pushq $-1 /* ORIG_RAX: no syscall to restart */ + /* - * Switch to the thread stack. The IRET frame and orig_ax are - * on the stack, as well as the return address. RDI..R12 are - * not (yet) on the stack and space has not (yet) been - * allocated for them. + * If the entry is from userspace, switch stacks and treat it as + * a normal entry. */ - pushq %rdi + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_switch_stack_\@ - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - movq %rsp, %rdi - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + /* + * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. + * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS + */ + call paranoid_entry - /* - * We have RDI, return address, and orig_ax on the stack on - * top of the IRET frame. That means offset=24 - */ - UNWIND_HINT_IRET_REGS base=%rdi offset=24 - - pushq 7*8(%rdi) /* regs->ss */ - pushq 6*8(%rdi) /* regs->rsp */ - pushq 5*8(%rdi) /* regs->eflags */ - pushq 4*8(%rdi) /* regs->cs */ - pushq 3*8(%rdi) /* regs->ip */ - pushq 2*8(%rdi) /* regs->orig_ax */ - pushq 8(%rdi) /* return address */ - UNWIND_HINT_FUNC + UNWIND_HINT_REGS - movq (%rdi), %rdi - jmp 2f -1: - FENCE_SWAPGS_KERNEL_ENTRY -2: - PUSH_AND_CLEAR_REGS save_ret=1 - ENCODE_FRAME_POINTER 8 + movq %rsp, %rdi /* pt_regs pointer */ - testb $3, CS+8(%rsp) - jz 1f + call \cfunc - /* - * IRQ from user mode. - * - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). Since TRACE_IRQS_OFF is idempotent, - * the simplest way to handle it is to just call it twice if - * we enter from user mode. There's no reason to optimize this since - * TRACE_IRQS_OFF is a no-op if lockdep is off. - */ - TRACE_IRQS_OFF + jmp paranoid_exit - CALL_enter_from_user_mode + /* Switch to the regular task stack and use the noist entry point */ +.Lfrom_usermode_switch_stack_\@: + idtentry_body noist_\cfunc, has_error_code=0 -1: - ENTER_IRQ_STACK old_rsp=%rdi save_ret=1 - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm - ret -SYM_CODE_END(interrupt_entry) -_ASM_NOKPROBE(interrupt_entry) +/* + * Double fault entry. Straight paranoid. No checks from which context + * this comes because for the espfix induced #DF this would do the wrong + * thing. + */ +.macro idtentry_df vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=8 + ASM_CLAC + + /* + * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. 
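The .Lfrom_usermode_switch_stack_\@ tail of idtentry_mce_db reuses idtentry_body with a noist_-prefixed C function, so each #DB/#MC handler effectively exists twice. A hedged sketch of the naming convention (generated by the DEFINE_IDTENTRY_* family in <asm/idtentry.h>; exact spellings assumed from this series):

/* kernel-mode hit: runs on the IST stack, paranoid entry/exit */
__visible void exc_debug(struct pt_regs *regs);

/* user-mode hit: runs on the task stack, normal entry/exit */
__visible void noist_exc_debug(struct pt_regs *regs);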
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS + */ + call paranoid_entry + UNWIND_HINT_REGS + movq %rsp, %rdi /* pt_regs pointer into first argument */ + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + call \cfunc -/* Interrupt entry/exit. */ + jmp paranoid_exit + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_spurious/interrupt. + * Include the defines which emit the idt entries which are shared + * shared between 32 and 64 bit and emit the __irqentry_text_* markers + * so the stacktrace boundary checks work. */ -SYM_CODE_START_LOCAL(common_spurious) - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call smp_spurious_interrupt /* rdi points to pt_regs */ - jmp ret_from_intr -SYM_CODE_END(common_spurious) -_ASM_NOKPROBE(common_spurious) - -/* common_interrupt is a hotpath. Align it */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -SYM_CODE_START_LOCAL(common_interrupt) - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call do_IRQ /* rdi points to pt_regs */ - /* 0(%rsp): old RSP */ -ret_from_intr: - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - - LEAVE_IRQ_STACK + .align 16 + .globl __irqentry_text_start +__irqentry_text_start: - testb $3, CS(%rsp) - jz retint_kernel +#include <asm/idtentry.h> - /* Interrupt came from user space */ -.Lretint_user: - mov %rsp,%rdi - call prepare_exit_to_usermode - TRACE_IRQS_ON + .align 16 + .globl __irqentry_text_end +__irqentry_text_end: +SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ @@ -637,6 +508,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ @@ -662,23 +534,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) INTERRUPT_RETURN -/* Returning to kernel space */ -retint_kernel: -#ifdef CONFIG_PREEMPTION - /* Interrupts are off */ - /* Check if we need preemption */ - btl $9, EFLAGS(%rsp) /* were interrupts off? */ - jnc 1f - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz 1f - call preempt_schedule_irq -1: -#endif - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ - SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates kernel mode. */ @@ -710,7 +565,7 @@ SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) /* * This may fault. Non-paranoid faults on return to userspace are * handled by fixup_bad_iret. These include #SS, #GP, and #NP. - * Double-faults due to espfix64 are handled in do_double_fault. + * Double-faults due to espfix64 are handled in exc_double_fault. * Other faults here are fatal. */ iretq @@ -788,280 +643,32 @@ native_irq_return_ldt: */ jmp native_irq_return_iret #endif -SYM_CODE_END(common_interrupt) -_ASM_NOKPROBE(common_interrupt) - -/* - * APIC interrupts. 
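The __irqentry_text_start/__irqentry_text_end labels above bracket every stub emitted from <asm/idtentry.h> so stack tracers can classify return addresses. A minimal sketch of the kind of boundary check they enable (illustrative; the actual users live in the unwinder and tracing code):

extern char __irqentry_text_start[], __irqentry_text_end[];

/* true if addr points into the interrupt-entry stubs */
static bool in_irqentry_text(unsigned long addr)
{
	return addr >= (unsigned long)__irqentry_text_start &&
	       addr <  (unsigned long)__irqentry_text_end;
}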
- */ -.macro apicinterrupt3 num sym do_sym -SYM_CODE_START(\sym) - UNWIND_HINT_IRET_REGS - pushq $~(\num) -.Lcommon_\sym: - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call \do_sym /* rdi points to pt_regs */ - jmp ret_from_intr -SYM_CODE_END(\sym) -_ASM_NOKPROBE(\sym) -.endm - -/* Make sure APIC interrupt handlers end up in the irqentry section: */ -#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -#define POP_SECTION_IRQENTRY .popsection - -.macro apicinterrupt num sym do_sym -PUSH_SECTION_IRQENTRY -apicinterrupt3 \num \sym \do_sym -POP_SECTION_IRQENTRY -.endm - -#ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt -#endif - -#ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt -#endif - -apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi - -#ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi -apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt -#endif - -#ifdef CONFIG_X86_MCE_AMD -apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt -#endif - -#ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt -#endif - -apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt - -#ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt -#endif +SYM_CODE_END(common_interrupt_return) +_ASM_NOKPROBE(common_interrupt_return) /* - * Exception entry points. - */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) - -.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0 - - .if \paranoid - call paranoid_entry - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - .else - call error_entry - .endif - UNWIND_HINT_REGS - - .if \read_cr2 - /* - * Store CR2 early so subsequent faults cannot clobber it. Use R12 as - * intermediate storage as RDX can be clobbered in enter_from_user_mode(). - * GET_CR2_INTO can clobber RAX. 
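The deleted apicinterrupt/apicinterrupt3 stubs above are superseded by idtentry_sysvec plus a C-side macro pair. A hedged sketch of the replacement shape for one vector, following the series' conventions (handler body abridged from arch/x86/kernel/apic/apic.c):

/* <asm/idtentry.h>:
 *   DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR,
 *			     sysvec_apic_timer_interrupt);
 * emits the asm stub; the C handler becomes: */
DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	ack_APIC_irq();
	local_apic_timer_interrupt();
	set_irq_regs(old_regs);
}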
- */ - GET_CR2_INTO(%r12); - .endif - - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - - .if \paranoid == 0 - testb $3, CS(%rsp) - jz .Lfrom_kernel_no_context_tracking_\@ - CALL_enter_from_user_mode -.Lfrom_kernel_no_context_tracking_\@: - .endif - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - .if \read_cr2 - movq %r12, %rdx /* Move CR2 into 3rd argument */ - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - .if \paranoid - /* this procedure expect "no swapgs" flag in ebx */ - jmp paranoid_exit - .else - jmp error_exit - .endif - -.endm - -/** - * idtentry - Generate an IDT entry stub - * @sym: Name of the generated entry point - * @do_sym: C function to be called - * @has_error_code: True if this IDT vector has an error code on the stack - * @paranoid: non-zero means that this vector may be invoked from - * kernel mode with user GSBASE and/or user CR3. - * 2 is special -- see below. - * @shift_ist: Set to an IST index if entries from kernel mode should - * decrement the IST stack so that nested entries get a - * fresh stack. (This is for #DB, which has a nasty habit - * of recursing.) - * @create_gap: create a 6-word stack gap when coming from kernel mode. - * @read_cr2: load CR2 into the 3rd argument; done before calling any C code - * - * idtentry generates an IDT stub that sets up a usable kernel context, - * creates struct pt_regs, and calls @do_sym. The stub has the following - * special behaviors: + * Reload gs selector with exception handling + * edi: new selector * - * On an entry from user mode, the stub switches from the trampoline or - * IST stack to the normal thread stack. On an exit to user mode, the - * normal exit-to-usermode path is invoked. - * - * On an exit to kernel mode, if @paranoid == 0, we check for preemption, - * whereas we omit the preemption check if @paranoid != 0. This is purely - * because the implementation is simpler this way. The kernel only needs - * to check for asynchronous kernel preemption when IRQ handlers return. - * - * If @paranoid == 0, then the stub will handle IRET faults by pretending - * that the fault came from user mode. It will handle gs_change faults by - * pretending that the fault happened with kernel GSBASE. Since this handling - * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have - * @paranoid == 0. This special handling will do the wrong thing for - * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. - * - * @paranoid == 2 is special: the stub will never switch stacks. This is for - * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. + * Is in entry.text as it shouldn't be instrumented. 
*/ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0 -SYM_CODE_START(\sym) - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 - - /* Sanity check */ - .if \shift_ist != -1 && \paranoid != 1 - .error "using shift_ist requires paranoid=1" - .endif - - .if \create_gap && \paranoid - .error "using create_gap requires paranoid=0" - .endif - - ASM_CLAC - - .if \has_error_code == 0 - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - - .if \paranoid == 1 - testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */ - jnz .Lfrom_usermode_switch_stack_\@ - .endif - - .if \create_gap == 1 - /* - * If coming from kernel space, create a 6-word gap to allow the - * int3 handler to emulate a call instruction. - */ - testb $3, CS-ORIG_RAX(%rsp) - jnz .Lfrom_usermode_no_gap_\@ - .rept 6 - pushq 5*8(%rsp) - .endr - UNWIND_HINT_IRET_REGS offset=8 -.Lfrom_usermode_no_gap_\@: - .endif - - idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset - - .if \paranoid == 1 - /* - * Entry from userspace. Switch stacks and treat it - * as a normal entry. This means that paranoid handlers - * run in real process context if user_mode(regs). - */ -.Lfrom_usermode_switch_stack_\@: - idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0 - .endif - -_ASM_NOKPROBE(\sym) -SYM_CODE_END(\sym) -.endm - -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* - * Reload gs selector with exception handling - * edi: new selector - */ -SYM_FUNC_START(native_load_gs_index) +SYM_FUNC_START(asm_load_gs_index) FRAME_BEGIN - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - TRACE_IRQS_OFF - SWAPGS + swapgs .Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE - SWAPGS - TRACE_IRQS_FLAGS (%rsp) - popfq + swapgs FRAME_END ret -SYM_FUNC_END(native_load_gs_index) -EXPORT_SYMBOL(native_load_gs_index) +SYM_FUNC_END(asm_load_gs_index) +EXPORT_SYMBOL(asm_load_gs_index) _ASM_EXTABLE(.Lgs_change, .Lbad_gs) .section .fixup, "ax" /* running with kernelgs */ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) - SWAPGS /* switch back to user gs */ + swapgs /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ movl $__USER_DS, %eax @@ -1074,20 +681,46 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) SYM_CODE_END(.Lbad_gs) .previous -/* Call softirq on interrupt stack. Interrupts are off. 
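With the pushfq/cli/popfq sequence dropped from asm_load_gs_index above, interrupt protection moves out to the C caller. The companion wrapper looks like this (lightly paraphrased from <asm/special_insns.h> in this series):

static inline void native_load_gs_index(unsigned int selector)
{
	unsigned long flags;

	/* the swapgs/mov-to-%gs window must not be interrupted */
	local_irq_save(flags);
	asm_load_gs_index(selector);
	local_irq_restore(flags);
}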
*/ -SYM_FUNC_START(do_softirq_own_stack) - pushq %rbp - mov %rsp, %rbp - ENTER_IRQ_STACK regs=0 old_rsp=%r11 - call __do_softirq - LEAVE_IRQ_STACK regs=0 +/* + * rdi: New stack pointer points to the top word of the stack + * rsi: Function pointer + * rdx: Function argument (can be NULL if none) + */ +SYM_FUNC_START(asm_call_on_stack) + /* + * Save the frame pointer unconditionally. This allows the ORC + * unwinder to handle the stack switch. + */ + pushq %rbp + mov %rsp, %rbp + + /* + * The unwinder relies on the word at the top of the new stack + * page linking back to the previous RSP. + */ + mov %rsp, (%rdi) + mov %rdi, %rsp + /* Move the argument to the right place */ + mov %rdx, %rdi + +1: + .pushsection .discard.instr_begin + .long 1b - . + .popsection + + CALL_NOSPEC rsi + +2: + .pushsection .discard.instr_end + .long 2b - . + .popsection + + /* Restore the previous stack pointer from RBP. */ leaveq ret -SYM_FUNC_END(do_softirq_own_stack) +SYM_FUNC_END(asm_call_on_stack) #ifdef CONFIG_XEN_PV -idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - /* * A note on the "critical region" in our callback handler. * We want to avoid stacking callback handlers due to events occurring @@ -1100,9 +733,10 @@ idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * So, on entry to the handler we detect whether we interrupted an * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. + * + * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs) */ -/* do_hypervisor_callback(struct *pt_regs) */ -SYM_CODE_START_LOCAL(xen_do_hypervisor_callback) +SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will @@ -1112,15 +746,10 @@ SYM_CODE_START_LOCAL(xen_do_hypervisor_callback) movq %rdi, %rsp /* we don't return, adjust the stack frame */ UNWIND_HINT_REGS - ENTER_IRQ_STACK old_rsp=%r10 - call xen_evtchn_do_upcall - LEAVE_IRQ_STACK + call xen_pv_evtchn_do_upcall -#ifndef CONFIG_PREEMPTION - call xen_maybe_preempt_hcall -#endif - jmp error_exit -SYM_CODE_END(xen_do_hypervisor_callback) + jmp error_return +SYM_CODE_END(exc_xen_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1155,7 +784,7 @@ SYM_CODE_START(xen_failsafe_callback) addq $0x30, %rsp pushq $0 /* RIP */ UNWIND_HINT_IRET_REGS offset=8 - jmp general_protection + jmp asm_exc_general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
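asm_call_on_stack expects %rdi to point at the top word of the target stack, which it links back to the old RSP so the ORC unwinder can walk across the switch. A hedged usage sketch in the style the series adopts for softirqs (the helper's real call sites arrive in follow-up patches; hardirq_stack_ptr is the existing per-CPU irq stack pointer):

void asm_call_on_stack(void *sp, void (*func)(void), void *arg);

static void run_softirq_on_irqstack(void)
{
	/* switch to the irq stack, run __do_softirq, switch back */
	asm_call_on_stack(__this_cpu_read(hardirq_stack_ptr),
			  __do_softirq, NULL);
}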
*/ movq (%rsp), %rcx movq 8(%rsp), %r11 @@ -1164,52 +793,10 @@ SYM_CODE_START(xen_failsafe_callback) pushq $-1 /* orig_ax = -1 => not a system call */ PUSH_AND_CLEAR_REGS ENCODE_FRAME_POINTER - jmp error_exit + jmp error_return SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ -#ifdef CONFIG_XEN_PVHVM -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - xen_hvm_callback_vector xen_evtchn_do_upcall -#endif - - -#if IS_ENABLED(CONFIG_HYPERV) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - hyperv_callback_vector hyperv_vector_handler - -apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ - hyperv_reenlightenment_vector hyperv_reenlightenment_intr - -apicinterrupt3 HYPERV_STIMER0_VECTOR \ - hv_stimer0_callback_vector hv_stimer0_vector_handler -#endif /* CONFIG_HYPERV */ - -#if IS_ENABLED(CONFIG_ACRN_GUEST) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - acrn_hv_callback_vector acrn_hv_vector_handler -#endif - -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET -idtentry int3 do_int3 has_error_code=0 create_gap=1 -idtentry stack_segment do_stack_segment has_error_code=1 - -#ifdef CONFIG_XEN_PV -idtentry xennmi do_nmi has_error_code=0 -idtentry xendebug do_debug has_error_code=0 -#endif - -idtentry general_protection do_general_protection has_error_code=1 -idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 - -#ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1 -#endif - -#ifdef CONFIG_X86_MCE -idtentry machine_check do_mce has_error_code=0 paranoid=1 -#endif - /* * Save all registers in pt_regs, and switch gs if needed. * Use slow, but surefire "are we in kernel?" check. @@ -1265,17 +852,13 @@ SYM_CODE_END(paranoid_entry) */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp restore_regs_and_return_to_kernel .Lparanoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 jmp restore_regs_and_return_to_kernel @@ -1339,7 +922,6 @@ SYM_CODE_START_LOCAL(error_entry) */ SWAPGS FENCE_SWAPGS_USER_ENTRY - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done .Lbstep_iret: @@ -1366,14 +948,13 @@ SYM_CODE_START_LOCAL(error_entry) jmp .Lerror_entry_from_usermode_after_swapgs SYM_CODE_END(error_entry) -SYM_CODE_START_LOCAL(error_exit) +SYM_CODE_START_LOCAL(error_return) UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF + DEBUG_ENTRY_ASSERT_IRQS_OFF testb $3, CS(%rsp) - jz retint_kernel - jmp .Lretint_user -SYM_CODE_END(error_exit) + jz restore_regs_and_return_to_kernel + jmp swapgs_restore_regs_and_return_to_usermode +SYM_CODE_END(error_return) /* * Runs on exception stack. Xen PV does not go through this path at all, @@ -1383,7 +964,7 @@ SYM_CODE_END(error_exit) * %r14: Used to save/restore the CR3 of the interrupted context * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ -SYM_CODE_START(nmi) +SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_REGS /* @@ -1468,7 +1049,7 @@ SYM_CODE_START(nmi) movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi /* * Return back to user mode. We must *not* do the normal exit @@ -1525,7 +1106,7 @@ SYM_CODE_START(nmi) * end_repeat_nmi, then we are a nested NMI. 
We must not * modify the "iret" frame because it's being written by * the outer NMI. That's okay; the outer NMI handler is - * about to about to call do_nmi anyway, so we can just + * about to about to call exc_nmi() anyway, so we can just * resume the outer NMI. */ @@ -1644,7 +1225,7 @@ repeat_nmi: * RSP is pointing to "outermost RIP". gsbase is unknown, but, if * we're repeating an NMI, gsbase has the same value that it had on * the first iteration. paranoid_entry will load the kernel - * gsbase if needed before we call do_nmi. "NMI executing" + * gsbase if needed before we call exc_nmi(). "NMI executing" * is zero. */ movq $1, 10*8(%rsp) /* Set "NMI executing". */ @@ -1678,10 +1259,9 @@ end_repeat_nmi: call paranoid_entry UNWIND_HINT_REGS - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 @@ -1718,7 +1298,7 @@ nmi_restore: * about espfix64 on the way back to kernel mode. */ iretq -SYM_CODE_END(nmi) +SYM_CODE_END(asm_exc_nmi) #ifndef CONFIG_IA32_EMULATION /* @@ -1732,6 +1312,7 @@ SYM_CODE_START(ignore_sysret) SYM_CODE_END(ignore_sysret) #endif +.pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ @@ -1739,7 +1320,8 @@ SYM_CODE_START(rewind_stack_do_exit) movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp - UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE + UNWIND_HINT_REGS call do_exit SYM_CODE_END(rewind_stack_do_exit) +.popsection diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index f1d3ccae5dd5..0f974ae01e62 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -46,12 +46,14 @@ * ebp user stack * 0(%ebp) arg6 */ -SYM_FUNC_START(entry_SYSENTER_compat) +SYM_CODE_START(entry_SYSENTER_compat) + UNWIND_HINT_EMPTY /* Interrupts are off on entry. */ SWAPGS - /* We are about to clobber %rsp anyway, clobbering here is OK */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + pushq %rax + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + popq %rax movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -104,6 +106,9 @@ SYM_FUNC_START(entry_SYSENTER_compat) xorl %r14d, %r14d /* nospec r14 */ pushq $0 /* pt_regs->r15 = 0 */ xorl %r15d, %r15d /* nospec r15 */ + + UNWIND_HINT_REGS + cld /* @@ -129,17 +134,11 @@ SYM_FUNC_START(entry_SYSENTER_compat) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF - movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ - "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV jmp sysret32_from_system_call .Lsysenter_fix_flags: @@ -147,7 +146,7 @@ SYM_FUNC_START(entry_SYSENTER_compat) popfq jmp .Lsysenter_flags_fixed SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) -SYM_FUNC_END(entry_SYSENTER_compat) +SYM_CODE_END(entry_SYSENTER_compat) /* * 32-bit SYSCALL entry. @@ -197,6 +196,7 @@ SYM_FUNC_END(entry_SYSENTER_compat) * 0(%esp) arg6 */ SYM_CODE_START(entry_SYSCALL_compat) + UNWIND_HINT_EMPTY /* Interrupts are off on entry. 
*/ swapgs @@ -247,17 +247,13 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) pushq $0 /* pt_regs->r15 = 0 */ xorl %r15d, %r15d /* nospec r15 */ - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF + UNWIND_HINT_REGS movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ - "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* Opportunistic SYSRET */ sysret32_from_system_call: @@ -266,7 +262,7 @@ sysret32_from_system_call: * stack. So let's erase the thread stack right now. */ STACKLEAK_ERASE - TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ movq RBP(%rsp), %rbp /* pt_regs->rbp */ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ @@ -340,6 +336,7 @@ SYM_CODE_END(entry_SYSCALL_compat) * ebp arg6 */ SYM_CODE_START(entry_INT80_compat) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. */ @@ -361,8 +358,11 @@ SYM_CODE_START(entry_INT80_compat) /* Need to switch before accessing the thread stack. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + /* In the Xen PV case we already run on the thread stack. */ - ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV + ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV + + movq %rsp, %rdi movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq 6*8(%rdi) /* regs->ss */ @@ -401,19 +401,12 @@ SYM_CODE_START(entry_INT80_compat) xorl %r14d, %r14d /* nospec r14 */ pushq %r15 /* pt_regs->r15 */ xorl %r15d, %r15d /* nospec r15 */ - cld - /* - * User mode is traced as though IRQs are on, and the interrupt - * gate turned them off. - */ - TRACE_IRQS_OFF + UNWIND_HINT_REGS + + cld movq %rsp, %rdi call do_int80_syscall_32 -.Lsyscall_32_done: - - /* Go back to user mode. */ - TRACE_IRQS_ON jmp swapgs_restore_regs_and_return_to_usermode SYM_CODE_END(entry_INT80_compat) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 54581ac671b4..d8f8a1a69ed1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -442,3 +442,4 @@ 435 i386 clone3 sys_clone3 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 37b844f839bc..78847b32e137 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -359,6 +359,7 @@ 435 common clone3 sys_clone3 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index dbe4493b534e..ccd32877a3c4 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -3,7 +3,6 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. 
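Both syscall tables above wire number 439 to sys_faccessat2, whose point over plain faccessat is the flags argument. A minimal user-space probe via the raw syscall number (glibc wrappers postdate this kernel):

#include <fcntl.h>		/* AT_FDCWD, AT_EACCESS */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>		/* syscall(), R_OK */

int main(void)
{
	/* 439 == __NR_faccessat2 in both tables added above */
	long ret = syscall(439, AT_FDCWD, "/etc/hosts", R_OK, AT_EACCESS);

	printf("faccessat2(AT_FDCWD, /etc/hosts, R_OK, AT_EACCESS) = %ld\n",
	       ret);
	return ret == 0 ? 0 : 1;
}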
*/ #include <linux/linkage.h> #include "calling.h" @@ -37,15 +36,6 @@ SYM_FUNC_END(\name) _ASM_NOKPROBE(\name) .endm -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - THUNK lockdep_sys_exit_thunk,lockdep_sys_exit -#endif - #ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace @@ -53,9 +43,7 @@ SYM_FUNC_END(\name) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) #endif -#if defined(CONFIG_TRACE_IRQFLAGS) \ - || defined(CONFIG_DEBUG_LOCK_ALLOC) \ - || defined(CONFIG_PREEMPTION) +#ifdef CONFIG_PREEMPTION SYM_CODE_START_LOCAL_NOALIGN(.L_restore) popq %r11 popq %r10 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 433a1259f61d..54e03ab26ff3 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -24,6 +24,8 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o +vobjs32-y += vdso32/vclock_gettime.o # files to link into kernel obj-y += vma.o @@ -37,10 +39,12 @@ vdso_img-$(VDSO32-y) += 32 obj-$(VDSO32-y) += vdso32-setup.o vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) +vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) +targets += vdso32/vdso32.lds $(vobjs32-y) # Build the vDSO image C files and link them in. vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) @@ -130,10 +134,6 @@ $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 -targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o -targets += vdso32/vclock_gettime.o - KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 @@ -158,12 +158,7 @@ endif $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(obj)/vdso32.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/system_call.o \ - $(obj)/vdso32/sigreturn.o +$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE $(call if_changed,vdso_and_check) # diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 3842873b3ae3..7380908045c7 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -187,7 +187,7 @@ static void map_input(const char *name, void **addr, size_t *len, int prot) int fd = open(name, O_RDONLY); if (fd == -1) - err(1, "%s", name); + err(1, "open(%s)", name); tmp_len = lseek(fd, 0, SEEK_END); if (tmp_len == (off_t)-1) @@ -240,7 +240,7 @@ int main(int argc, char **argv) outfilename = argv[3]; outfile = fopen(outfilename, "w"); if (!outfile) - err(1, "%s", argv[2]); + err(1, "fopen(%s)", outfilename); go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index a20b134de2a8..6f46e11ce539 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -13,8 +13,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, unsigned long load_size = -1; /* Work around bogus warning */ unsigned long mapping_size; ELF(Ehdr) *hdr 
= (ELF(Ehdr) *)raw_addr; - int i; - unsigned long j; + unsigned long i, syms_nr; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, *alt_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; @@ -86,11 +85,10 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); + syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); /* Walk the symbol table */ - for (i = 0; - i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); - i++) { - int k; + for (i = 0; i < syms_nr; i++) { + unsigned int k; ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; const char *sym_name = raw_addr + @@ -150,11 +148,11 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); - for (j = 0; j < stripped_len; j++) { - if (j % 10 == 0) + for (i = 0; i < stripped_len; i++) { + if (i % 10 == 0) fprintf(outfile, "\n\t"); fprintf(outfile, "0x%02X, ", - (int)((unsigned char *)stripped_addr)[j]); + (int)((unsigned char *)stripped_addr)[i]); } fprintf(outfile, "\n};\n\n"); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 43428cc514c8..ea7c1f0b79df 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -144,7 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) struct mm_struct *mm = task->mm; struct vm_area_struct *vma; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; for (vma = mm->mmap; vma; vma = vma->vm_next) { @@ -154,7 +154,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) zap_page_range(vma, vma->vm_start, size); } - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return 0; } #else @@ -268,7 +268,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) unsigned long text_start; int ret = 0; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; addr = get_unmapped_area(NULL, addr, @@ -311,7 +311,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } up_fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } @@ -373,7 +373,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); /* * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_speciall_mapping, which may @@ -384,11 +384,11 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma_is_special_mapping(vma, &vdso_mapping) || vma_is_special_mapping(vma, &vvar_mapping)) { - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return -EEXIST; } } - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return map_vdso(image, addr); } diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 9a7a1446cb3a..4a809c6cbd2f 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -10,11 +10,11 @@ config PERF_EVENTS_INTEL_UNCORE available on NehalemEX and more modern processors. 
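The vma.c hunks above are part of the tree-wide mmap_sem to mmap_lock API conversion. The wrappers are deliberately thin, which is what makes the mechanical replacement safe (abridged from include/linux/mmap_lock.h; early versions of the header still wrap the old mmap_sem field):

static inline void mmap_write_lock(struct mm_struct *mm)
{
	down_write(&mm->mmap_lock);
}

static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
	return down_write_killable(&mm->mmap_lock);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
	up_write(&mm->mmap_lock);
}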
config PERF_EVENTS_INTEL_RAPL - tristate "Intel rapl performance events" - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + tristate "Intel/AMD rapl performance events" + depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI default y ---help--- - Include support for Intel rapl performance events for power + Include support for Intel and AMD rapl performance events for power monitoring on modern processors. config PERF_EVENTS_INTEL_CSTATE diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9e07f554333f..12c42eba77ec 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += core.o probe.o +obj-$(PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ +obj-$(CONFIG_CPU_SUP_CENTAUR) += zhaoxin/ +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin/ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a619763e96e1..4103665c6e03 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + err = zhaoxin_pmu_init(); + break; default: err = -ENOTSUPP; } @@ -2162,11 +2166,6 @@ static int x86_pmu_event_init(struct perf_event *event) return err; } -static void refresh_pce(void *ignored) -{ - load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm)); -} - static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) @@ -2179,13 +2178,13 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * userspace with CR4.PCE clear while another task is still * doing on_each_cpu_mask() to propagate CR4.PCE. * - * For now, this can't happen because all callers hold mmap_sem + * For now, this can't happen because all callers hold mmap_lock * for write. If this changes, we'll need a different solution. 
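One caution on the events Makefile hunk above: obj-$(PERF_EVENTS_INTEL_RAPL) appears to be missing the CONFIG_ prefix, so the variable expands to nothing and rapl.o is silently never added to the object list. Kbuild keys object lists on the CONFIG_* symbol, so the conventional form would be:

obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)	+= rapl.o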
*/ - lockdep_assert_held_write(&mm->mmap_sem); + mmap_assert_write_locked(mm); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); + on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) @@ -2195,7 +2194,7 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); + on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) @@ -2253,7 +2252,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev, else if (x86_pmu.attr_rdpmc == 2) static_branch_dec(&rdpmc_always_available_key); - on_each_cpu(refresh_pce, NULL, 1); + on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 3468b0c1dc7c..e67a5886336c 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -2,8 +2,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o -intel-rapl-perf-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6a3b599ee0fe..731dd8d0dbb1 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -58,7 +58,7 @@ struct bts_buffer { local_t head; unsigned long end; void **data_pages; - struct bts_phys buf[0]; + struct bts_phys buf[]; }; static struct pmu bts_pmu; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 332954cccece..ca35c8b5ee10 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1892,8 +1892,8 @@ static __initconst const u64 tnt_hw_cache_extra_regs static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1), EVENT_EXTRA_END }; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index e4aa20c0426f..442e1ed4acd4 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -643,6 +643,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1db7a51d9792..e94af4a54d0d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -226,8 +226,6 @@ static int __init pt_pmu_hw_init(void) pt_pmu.vmx = true; } - attrs = NULL; - for (i = 0; i < 
PT_CPUID_LEAVES; i++) { cpuid_count(20, i, &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 0da4a4605536..b469ddd45515 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -130,7 +130,7 @@ struct intel_uncore_box { struct list_head list; struct list_head active_list; void __iomem *io_addr; - struct intel_uncore_extra_reg shared_regs[0]; + struct intel_uncore_extra_reg shared_regs[]; }; /* CFL uncore 8th cbox MSRs */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f1cd1ca1a77b..e17a3d8a47ed 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -618,6 +618,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, + enabled_ack :1, counter_freezing :1; /* * sysfs attrs @@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void) return 0; } #endif /* CONFIG_CPU_SUP_INTEL */ + +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN)) +int zhaoxin_pmu_init(void); +#else +static inline int zhaoxin_pmu_init(void) +{ + return 0; +} +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/ diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index c2ede2f3b277..136a1e847254 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -10,6 +10,11 @@ not_visible(struct kobject *kobj, struct attribute *attr, int i) return 0; } +/* + * Accepts msr[] array with non populated entries as long as either + * msr[i].msr is 0 or msr[i].grp is NULL. Note that the default sysfs + * visibility is visible when group->is_visible callback is set. + */ unsigned long perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) { @@ -24,8 +29,16 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + /* skip entry with no group */ + if (!grp) + continue; + grp->is_visible = not_visible; + /* skip unpopulated entry */ + if (!msr[bit].msr) + continue; + if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/rapl.c index a5dbd25852cb..0f2bf59f4354 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/rapl.c @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Support Intel RAPL energy consumption counters + * Support Intel/AMD RAPL energy consumption counters * Copyright (C) 2013 Google, Inc., Stephane Eranian * * Intel RAPL interface is specified in the IA-32 Manual Vol3b * section 14.7.1 (September 2013) * + * AMD RAPL interface for Fam17h is described in the public PPR: + * https://bugzilla.kernel.org/show_bug.cgi?id=206537 + * * RAPL provides more controls than just reporting energy consumption * however here we only expose the 3 energy consumption free running * counters (pp0, pkg, dram). 
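The perf_msr_probe() contract documented above (entries may stay unpopulated as long as .msr is 0 or .grp is NULL) is what lets the sparse amd_rapl_msrs[] table further down share event indices with the Intel one. A usage sketch mirroring the rapl.c call site:

/* bit i of the result is set iff msr[i] probed as present */
unsigned long cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
					 false /* zero */,
					 (void *)&rm->events);

if (cntr_mask & BIT(PERF_RAPL_PKG))
	pr_info("RAPL package energy counter available\n");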
@@ -58,8 +61,8 @@ #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> -#include "../perf_event.h" -#include "../probe.h" +#include "perf_event.h" +#include "probe.h" MODULE_LICENSE("GPL"); @@ -128,7 +131,9 @@ struct rapl_pmus { }; struct rapl_model { + struct perf_msr *rapl_msrs; unsigned long events; + unsigned int msr_power_unit; bool apply_quirk; }; @@ -138,7 +143,7 @@ static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; +static struct perf_msr *rapl_msrs; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { @@ -455,9 +460,16 @@ static struct attribute *rapl_events_cores[] = { NULL, }; +static umode_t +rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -470,6 +482,7 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -482,6 +495,7 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -494,6 +508,7 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -506,6 +521,7 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, + .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) @@ -513,7 +529,7 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } -static struct perf_msr rapl_msrs[] = { +static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, @@ -521,6 +537,16 @@ static struct perf_msr rapl_msrs[] = { [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, }; +/* + * Force to PERF_RAPL_MAX size due to: + * - perf_msr_probe(PERF_RAPL_MAX) + * - want to use same event codes across both architectures + */ +static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = { + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, +}; + + static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -575,13 +601,13 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_check_hw_unit(bool apply_quirk) +static int rapl_check_hw_unit(struct rapl_model *rm) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; @@ -592,7 
+618,7 @@ static int rapl_check_hw_unit(bool apply_quirk) * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ - if (apply_quirk) + if (rm->apply_quirk) rapl_hw_unit[PERF_RAPL_RAM] = 16; /* @@ -673,6 +699,8 @@ static struct rapl_model model_snb = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { @@ -680,6 +708,8 @@ static struct rapl_model model_snbep = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { @@ -688,6 +718,8 @@ static struct rapl_model model_hsw = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { @@ -695,12 +727,16 @@ static struct rapl_model model_hsx = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { @@ -710,6 +746,15 @@ static struct rapl_model model_skl = { BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, +}; + +static struct rapl_model model_amd_fam17h = { .events = BIT(PERF_RAPL_PKG), .apply_quirk = false, + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -738,8 +783,11 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); @@ -755,10 +803,13 @@ static int __init rapl_pmu_init(void) return -ENODEV; rm = (struct rapl_model *) id->driver_data; + + rapl_msrs = rm->rapl_msrs; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events); - ret = rapl_check_hw_unit(rm->apply_quirk); + ret = rapl_check_hw_unit(rm); if (ret) return ret; diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile new file mode 100644 index 000000000000..642c1174d662 --- /dev/null +++ b/arch/x86/events/zhaoxin/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += core.o diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c new file mode 100644 index 000000000000..898fa1ae9ceb --- /dev/null +++ b/arch/x86/events/zhaoxin/core.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Zhaoxin PMU; like Intel Architectural PerfMon-v2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/nmi.h> + +#include
<asm/cpufeature.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "../perf_event.h" + +/* + * Zhaoxin PerfMon, used on zxc and later. + */ +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { + + [PERF_COUNT_HW_CPU_CYCLES] = 0x0082, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0515, + [PERF_COUNT_HW_CACHE_MISSES] = 0x051a, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0083, +}; + +static struct event_constraint zxc_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint zxd_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */ + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */ + EVENT_CONSTRAINT_END +}; + +static __initconst const u64 zxd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x0538, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + [C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0700, + [C(RESULT_MISS)] = 0x0709, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static __initconst const u64 zxe_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x054b, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + 
[C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0028, + [C(RESULT_MISS)] = 0x0029, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static void zhaoxin_pmu_disable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void zhaoxin_pmu_enable_all(int added) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static inline u64 zhaoxin_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void zhaoxin_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void zxc_pmu_ack_status(u64 ack) +{ + /* + * ZXC needs global control enabled in order to clear status bits. 
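Once a driver like this registers, its counters are consumed through the ordinary perf syscall interface. A minimal user-space sketch for counting cycles (standard Linux API, error handling omitted, nothing vendor-specific assumed):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;	/* resolved via the PMU's event map */
	attr.disabled = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload to measure ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	read(fd, &count, sizeof(count));
	printf("cycles: %lld\n", count);
	return 0;
}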
+ */ + zhaoxin_pmu_enable_all(0); + zhaoxin_pmu_ack_status(ack); + zhaoxin_pmu_disable_all(); +} + +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); +} + +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_enable_fixed(hwc); + return; + } + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + u64 status; + int bit; + + cpuc = this_cpu_ptr(&cpu_hw_events); + apic_write(APIC_LVTPC, APIC_DM_NMI); + zhaoxin_pmu_disable_all(); + status = zhaoxin_pmu_get_status(); + if (!status) + goto done; + +again: + if (x86_pmu.enabled_ack) + zxc_pmu_ack_status(status); + else + zhaoxin_pmu_ack_status(status); + + inc_irq_stat(apic_perf_irqs); + + /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. 
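The 4-bits-per-counter encoding used by zhaoxin_pmu_enable_fixed() above is easy to verify in isolation. A stand-alone sketch of the mask/bits arithmetic with made-up register contents (no real MSR access):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int idx = 1;				/* fixed counter 1 */
	uint64_t ctrl_val = 0x3;		/* pretend FIXED_CTR_CTRL value */
	uint64_t bits = 0x8 | 0x2;		/* PMI + ring-3 counting */
	uint64_t mask = 0xfULL << (idx * 4);

	bits <<= idx * 4;
	ctrl_val = (ctrl_val & ~mask) | bits;	/* clear field, write 0xa there */
	printf("FIXED_CTR_CTRL -> %#llx\n",	/* prints 0xa3 */
	       (unsigned long long)ctrl_val);
	return 0;
}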
+ */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + x86_perf_event_update(event); + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = zhaoxin_pmu_get_status(); + if (status) + goto again; + +done: + zhaoxin_pmu_enable_all(0); + return handled; +} + +static u64 zhaoxin_pmu_event_map(int hw_event) +{ + return zx_pmon_event_map[hw_event]; +} + +static struct event_constraint * +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} + +PMU_FORMAT_ATTR(event, "config:0-7"); +PMU_FORMAT_ATTR(umask, "config:8-15"); +PMU_FORMAT_ATTR(edge, "config:18"); +PMU_FORMAT_ATTR(inv, "config:23"); +PMU_FORMAT_ATTR(cmask, "config:24-31"); + +static struct attribute *zx_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +static const struct x86_pmu zhaoxin_pmu __initconst = { + .name = "zhaoxin", + .handle_irq = zhaoxin_pmu_handle_irq, + .disable_all = zhaoxin_pmu_disable_all, + .enable_all = zhaoxin_pmu_enable_all, + .enable = zhaoxin_pmu_enable_event, + .disable = zhaoxin_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = zhaoxin_pmu_event_map, + .max_events = ARRAY_SIZE(zx_pmon_event_map), + .apic = 1, + /* + * For zxd/zxe, read/write operation for PMCx MSR is 48 bits. 
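The .max_period value that follows is a direct consequence of those 48 bits: perf arms a counter with the negated sampling period, and capping the period at 2^47 - 1 keeps bit 47 of the armed value set until the counter actually wraps, so a pending overflow stays detectable. A stand-alone sketch of the arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cntval_mask = (1ULL << 48) - 1;	/* 48-bit PMC */
	uint64_t period = (1ULL << 47) - 1;		/* the maximum allowed */
	uint64_t armed = (0 - period) & cntval_mask;

	/* bit 47 is still set in the armed value */
	printf("armed = %#llx, bit47 = %d\n",
	       (unsigned long long)armed, (int)((armed >> 47) & 1));
	return 0;
}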
+ */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = zhaoxin_get_event_constraints, + + .format_attrs = zx_arch_formats_attr, + .events_sysfs_show = zhaoxin_event_sysfs_show, +}; + +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void zhaoxin_arch_events_quirk(void) +{ + int bit; + + /* disable events reported as not present by CPUID */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { + zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + zx_arch_events_map[bit].name); + } +} + +__init int zhaoxin_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + int version; + + pr_info("Welcome to zhaoxin pmu!\n"); + + /* + * Check whether the Architectural PerfMon supports + * hw_event or not. + */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1) + return -ENODEV; + + version = eax.split.version_id; + if (version != 2) + return -ENODEV; + + x86_pmu = zhaoxin_pmu; + pr_info("Version check pass!\n"); + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_add_quirk(zhaoxin_arch_events_quirk); + + switch (boot_cpu_data.x86) { + case 0x06: + if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) { + + x86_pmu.max_period = x86_pmu.cntval_mask >> 1; + + /* Clearing status works only if the global control is enabled on zxc. 
*/ + x86_pmu.enabled_ack = 1; + + x86_pmu.event_constraints = zxc_event_constraints; + zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0; + + pr_cont("ZXC events, "); + break; + } + return -ENODEV; + + case 0x07: + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01); + + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0); + + switch (boot_cpu_data.x86_model) { + case 0x1b: + memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709; + + pr_cont("ZXD events, "); + break; + case 0x3b: + memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029; + + pr_cont("ZXE events, "); + break; + default: + return -ENODEV; + } + break; + + default: + return -ENODEV; + } + + x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1; + x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + + return 0; +} + diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index b0da5320bcff..a54c6a401581 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -15,11 +15,13 @@ #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> +#include <asm/idtentry.h> #include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/hyperv.h> #include <linux/slab.h> +#include <linux/kernel.h> #include <linux/cpuhotplug.h> #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> @@ -72,7 +74,8 @@ static int hv_cpu_init(unsigned int cpu) struct page *pg; input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - pg = alloc_page(GFP_KERNEL); + /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ + pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!pg)) return -ENOMEM; *input_arg = page_address(pg); @@ -95,8 +98,7 @@ static int hv_cpu_init(unsigned int cpu) * not be stopped in the case of CPU offlining and the VM will hang. 
*/ if (!*hvp) { - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); } if (*hvp) { @@ -151,15 +153,11 @@ static inline bool hv_reenlightenment_available(void) ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; } -__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) { - entering_ack_irq(); - + ack_APIC_irq(); inc_irq_stat(irq_hv_reenlightenment_count); - schedule_delayed_work(&hv_reenlightenment_work, HZ/10); - - exiting_irq(); } void set_hv_tscchange_cb(void (*cb)(void)) @@ -224,10 +222,18 @@ static int hv_cpu_die(unsigned int cpu) rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); if (re_ctrl.target_vp == hv_vp_index[cpu]) { - /* Reassign to some other online CPU */ + /* + * Reassign reenlightenment notifications to some other online + * CPU or just disable the feature if there are no online CPUs + * left (happens on hibernation). + */ new_cpu = cpumask_any_but(cpu_online_mask, cpu); - re_ctrl.target_vp = hv_vp_index[new_cpu]; + if (new_cpu < nr_cpu_ids) + re_ctrl.target_vp = hv_vp_index[new_cpu]; + else + re_ctrl.enabled = 0; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); } @@ -253,6 +259,7 @@ static int __init hv_pci_init(void) static int hv_suspend(void) { union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; /* * Reset the hypercall page as it is going to be invalidated @@ -269,12 +276,17 @@ static int hv_suspend(void) hypercall_msr.enable = 0; wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); - return 0; + ret = hv_cpu_die(0); + return ret; } static void hv_resume(void) { union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; + + ret = hv_cpu_init(0); + WARN_ON(ret); /* Re-enable the hypercall page */ rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); @@ -285,8 +297,16 @@ static void hv_resume(void) hv_hypercall_pg = hv_hypercall_pg_saved; hv_hypercall_pg_saved = NULL; + + /* + * Reenlightenment notifications are disabled by hv_cpu_die(0), + * reenable them here if hv_reenlightenment_cb was previously set. + */ + if (hv_reenlightenment_cb) + set_hv_tscchange_cb(hv_reenlightenment_cb); } +/* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ static struct syscore_ops hv_syscore_ops = { .suspend = hv_suspend, .resume = hv_resume, @@ -355,7 +375,7 @@ void __init hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + hv_hypercall_pg = vmalloc_exec(PAGE_SIZE); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); goto remove_cpuhp_state; @@ -419,11 +439,14 @@ void hyperv_cleanup(void) } EXPORT_SYMBOL_GPL(hyperv_cleanup); -void hyperv_report_panic(struct pt_regs *regs, long err) +void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) { static bool panic_reported; u64 guest_id; + if (in_die && !panic_on_oops) + return; + /* * We prefer to report panic on 'die' chain as we have proper * registers to report, but if we miss it (e.g. 
on BUG()) we need diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 3d21eab7aaed..6efe6cb3768a 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <asm/unistd_32.h> +#include <asm/audit.h> unsigned ia32_dir_class[] = { #include <asm-generic/audit_dir_write.h> diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 9bb71abd66bd..385d3d172ee1 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -131,7 +131,7 @@ static int load_aout_binary(struct linux_binprm *bprm) return -ENOMEM; /* Flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); + retval = begin_new_exec(bprm); if (retval) return retval; @@ -156,8 +156,6 @@ static int load_aout_binary(struct linux_binprm *bprm) if (retval < 0) return retval; - install_exec_creds(bprm); - if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9d8804144d0..81cf22398cd1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -350,7 +350,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); user_access_end(); - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h new file mode 100644 index 000000000000..1b07fb102c4e --- /dev/null +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -0,0 +1,25 @@ +#ifdef CONFIG_64BIT +GEN(rax) +GEN(rbx) +GEN(rcx) +GEN(rdx) +GEN(rsi) +GEN(rdi) +GEN(rbp) +GEN(r8) +GEN(r9) +GEN(r10) +GEN(r11) +GEN(r12) +GEN(r13) +GEN(r14) +GEN(r15) +#else +GEN(eax) +GEN(ebx) +GEN(ecx) +GEN(edx) +GEN(esi) +GEN(edi) +GEN(ebp) +#endif diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h deleted file mode 100644 index 4adb13f08af7..000000000000 --- a/arch/x86/include/asm/acrn.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_ACRN_H -#define _ASM_X86_ACRN_H - -extern void acrn_hv_callback_vector(void); -#ifdef CONFIG_TRACING -#define trace_acrn_hv_callback_vector acrn_hv_callback_vector -#endif - -extern void acrn_hv_vector_handler(struct pt_regs *regs); -#endif /* _ASM_X86_ACRN_H */ diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 8e25bf4f323a..62da760d6d5a 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_AGP_H #define _ASM_X86_AGP_H -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <asm/cacheflush.h> /* diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index 99bb207fc04c..87ce8e963215 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -25,11 +25,7 @@ #define APBT_MIN_FREQ 1000000 #define APBT_MMAP_SIZE 1024 -#define APBT_DEV_USED 1 - extern void apbt_time_init(void); -extern unsigned long apbt_quick_calibrate(void); -extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); @@ -38,7 +34,6 @@ extern int sfi_mtimer_num; #else /* CONFIG_APB_TIMER */ -static inline unsigned long apbt_quick_calibrate(void) {return 0; } static inline void 
apbt_time_init(void) { } #endif diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 19e94af9cc5d..2cc44e957c31 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -519,39 +519,6 @@ static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif -extern void irq_enter(void); -extern void irq_exit(void); - -static inline void entering_irq(void) -{ - irq_enter(); - kvm_set_cpu_l1tf_flush_l1d(); -} - -static inline void entering_ack_irq(void) -{ - entering_irq(); - ack_APIC_irq(); -} - -static inline void ipi_entering_ack_irq(void) -{ - irq_enter(); - ack_APIC_irq(); - kvm_set_cpu_l1tf_flush_l1d(); -} - -static inline void exiting_irq(void) -{ - irq_exit(); -} - -static inline void exiting_ack_irq(void) -{ - ack_APIC_irq(); - irq_exit(); -} - extern void ioapic_zap_locks(void); #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 7a4bb1bd4bdb..ebc248e49549 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -15,16 +15,6 @@ #define RDRAND_RETRY_LOOPS 10 -#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" -#define RDSEED_INT ".byte 0x0f,0xc7,0xf8" -#ifdef CONFIG_X86_64 -# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" -# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8" -#else -# define RDRAND_LONG RDRAND_INT -# define RDSEED_LONG RDSEED_INT -#endif - /* Unconditional execution of RDRAND and RDSEED */ static inline bool __must_check rdrand_long(unsigned long *v) @@ -32,9 +22,9 @@ static inline bool __must_check rdrand_long(unsigned long *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG + asm volatile("rdrand %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); @@ -46,9 +36,9 @@ static inline bool __must_check rdrand_int(unsigned int *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT + asm volatile("rdrand %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); @@ -58,18 +48,18 @@ static inline bool __must_check rdrand_int(unsigned int *v) static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG + asm volatile("rdseed %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } static inline bool __must_check rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT + asm volatile("rdseed %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ce92c4acc913..5a42f9206138 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -1,13 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/ftrace.h> #include <linux/uaccess.h> +#include <linux/pgtable.h> #include <asm/string.h> #include <asm/page.h> #include <asm/checksum.h> #include <asm-generic/asm-prototypes.h> -#include <asm/pgtable.h> #include <asm/special_insns.h> #include <asm/preempt.h> #include <asm/asm.h> @@ -17,24 +17,19 @@ extern void cmpxchg8b_emu(void); #endif #ifdef CONFIG_RETPOLINE -#ifdef CONFIG_X86_32 -#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); -#else -#define INDIRECT_THUNK(reg) extern 
asmlinkage void __x86_indirect_thunk_r ## reg(void); -INDIRECT_THUNK(8) -INDIRECT_THUNK(9) -INDIRECT_THUNK(10) -INDIRECT_THUNK(11) -INDIRECT_THUNK(12) -INDIRECT_THUNK(13) -INDIRECT_THUNK(14) -INDIRECT_THUNK(15) -#endif -INDIRECT_THUNK(ax) -INDIRECT_THUNK(bx) -INDIRECT_THUNK(cx) -INDIRECT_THUNK(dx) -INDIRECT_THUNK(si) -INDIRECT_THUNK(di) -INDIRECT_THUNK(bp) + +#define DECL_INDIRECT_THUNK(reg) \ + extern asmlinkage void __x86_indirect_thunk_ ## reg (void); + +#define DECL_RETPOLINE(reg) \ + extern asmlinkage void __x86_retpoline_ ## reg (void); + +#undef GEN +#define GEN(reg) DECL_INDIRECT_THUNK(reg) +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) DECL_RETPOLINE(reg) +#include <asm/GEN-for-each-reg.h> + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 115127c7ad28..bf35e476a776 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -28,7 +28,7 @@ static __always_inline int arch_atomic_read(const atomic_t *v) * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here, * it's non-inlined function that increases binary size and stack usage. */ - return READ_ONCE((v)->counter); + return __READ_ONCE((v)->counter); } /** @@ -40,7 +40,7 @@ static __always_inline int arch_atomic_read(const atomic_t *v) */ static __always_inline void arch_atomic_set(atomic_t *v, int i) { - WRITE_ONCE(v->counter, i); + __WRITE_ONCE(v->counter, i); } /** @@ -166,6 +166,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } +#define arch_atomic_add_return arch_atomic_add_return /** * arch_atomic_sub_return - subtract integer and return @@ -178,34 +179,39 @@ static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) { return arch_atomic_add_return(-i, v); } +#define arch_atomic_sub_return arch_atomic_sub_return static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); } +#define arch_atomic_fetch_add arch_atomic_fetch_add static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) { return xadd(&v->counter, -i); } +#define arch_atomic_fetch_sub arch_atomic_fetch_sub static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return arch_cmpxchg(&v->counter, old, new); } +#define arch_atomic_cmpxchg arch_atomic_cmpxchg -#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { return try_cmpxchg(&v->counter, old, new); } +#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg -static inline int arch_atomic_xchg(atomic_t *v, int new) +static __always_inline int arch_atomic_xchg(atomic_t *v, int new) { return arch_xchg(&v->counter, new); } +#define arch_atomic_xchg arch_atomic_xchg -static inline void arch_atomic_and(int i, atomic_t *v) +static __always_inline void arch_atomic_and(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "andl %1,%0" : "+m" (v->counter) @@ -213,7 +219,7 @@ static inline void arch_atomic_and(int i, atomic_t *v) : "memory"); } -static inline int arch_atomic_fetch_and(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int val = arch_atomic_read(v); @@ -221,8 +227,9 @@ static inline int arch_atomic_fetch_and(int i, atomic_t *v) return val; } +#define arch_atomic_fetch_and arch_atomic_fetch_and -static inline void arch_atomic_or(int i, atomic_t *v) +static __always_inline void arch_atomic_or(int i, atomic_t *v) { asm 
volatile(LOCK_PREFIX "orl %1,%0" : "+m" (v->counter) @@ -230,7 +237,7 @@ static inline void arch_atomic_or(int i, atomic_t *v) : "memory"); } -static inline int arch_atomic_fetch_or(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int val = arch_atomic_read(v); @@ -238,8 +245,9 @@ static inline int arch_atomic_fetch_or(int i, atomic_t *v) return val; } +#define arch_atomic_fetch_or arch_atomic_fetch_or -static inline void arch_atomic_xor(int i, atomic_t *v) +static __always_inline void arch_atomic_xor(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "xorl %1,%0" : "+m" (v->counter) @@ -247,7 +255,7 @@ static inline void arch_atomic_xor(int i, atomic_t *v) : "memory"); } -static inline int arch_atomic_fetch_xor(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int val = arch_atomic_read(v); @@ -255,6 +263,7 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) return val; } +#define arch_atomic_fetch_xor arch_atomic_fetch_xor #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> @@ -262,6 +271,6 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) # include <asm/atomic64_64.h> #endif -#include <asm-generic/atomic-instrumented.h> +#define ARCH_ATOMIC #endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 52cfaecb13f9..5efd01b548d1 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -75,6 +75,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { return arch_cmpxchg64(&v->counter, o, n); } +#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg /** * arch_atomic64_xchg - xchg atomic64 variable @@ -94,6 +95,7 @@ static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) : "memory"); return o; } +#define arch_atomic64_xchg arch_atomic64_xchg /** * arch_atomic64_set - set atomic64 variable @@ -138,6 +140,7 @@ static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) ASM_NO_INPUT_CLOBBER("memory")); return i; } +#define arch_atomic64_add_return arch_atomic64_add_return /* * Other variants with different arithmetic operators: @@ -149,6 +152,7 @@ static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) ASM_NO_INPUT_CLOBBER("memory")); return i; } +#define arch_atomic64_sub_return arch_atomic64_sub_return static inline s64 arch_atomic64_inc_return(atomic64_t *v) { @@ -242,6 +246,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) "S" (v) : "memory"); return (int)a; } +#define arch_atomic64_add_unless arch_atomic64_add_unless static inline int arch_atomic64_inc_not_zero(atomic64_t *v) { @@ -281,6 +286,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) return old; } +#define arch_atomic64_fetch_and arch_atomic64_fetch_and static inline void arch_atomic64_or(s64 i, atomic64_t *v) { @@ -299,6 +305,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) return old; } +#define arch_atomic64_fetch_or arch_atomic64_fetch_or static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { @@ -317,6 +324,7 @@ static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) return old; } +#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { @@ -327,6 +335,7 @@ static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) return old; } +#define arch_atomic64_fetch_add arch_atomic64_fetch_add #define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), 
(v)) diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 95c6ceac66b9..809bd010a751 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -19,7 +19,7 @@ */ static inline s64 arch_atomic64_read(const atomic64_t *v) { - return READ_ONCE((v)->counter); + return __READ_ONCE((v)->counter); } /** @@ -31,7 +31,7 @@ static inline s64 arch_atomic64_read(const atomic64_t *v) */ static inline void arch_atomic64_set(atomic64_t *v, s64 i) { - WRITE_ONCE(v->counter, i); + __WRITE_ONCE(v->counter, i); } /** @@ -159,37 +159,43 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); } +#define arch_atomic64_add_return arch_atomic64_add_return static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { return arch_atomic64_add_return(-i, v); } +#define arch_atomic64_sub_return arch_atomic64_sub_return static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } +#define arch_atomic64_fetch_add arch_atomic64_fetch_add static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) { return xadd(&v->counter, -i); } +#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } +#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg -#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { return try_cmpxchg(&v->counter, old, new); } +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } +#define arch_atomic64_xchg arch_atomic64_xchg static inline void arch_atomic64_and(s64 i, atomic64_t *v) { @@ -207,6 +213,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); return val; } +#define arch_atomic64_fetch_and arch_atomic64_fetch_and static inline void arch_atomic64_or(s64 i, atomic64_t *v) { @@ -224,6 +231,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); return val; } +#define arch_atomic64_fetch_or arch_atomic64_fetch_or static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { @@ -241,5 +249,6 @@ static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); return val; } +#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h new file mode 100644 index 000000000000..36aec57ea7a3 --- /dev/null +++ b/arch/x86/include/asm/audit.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_AUDIT_H +#define _ASM_X86_AUDIT_H + +int ia32_classify_syscall(unsigned int syscall); + +#endif /* _ASM_X86_AUDIT_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 53f246e9df5a..0367efdc5b7a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -52,9 +52,9 @@ static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "orb %1,%0" + asm volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) - : "iq" (CONST_MASK(nr) & 0xff) + : "iq" (CONST_MASK(nr)) : "memory"); } else { asm 
volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" @@ -72,9 +72,9 @@ static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "andb %1,%0" + asm volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) - : "iq" (CONST_MASK(nr) ^ 0xff)); + : "iq" (~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); @@ -123,9 +123,9 @@ static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "xorb %1,%0" + asm volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) - : "iq" ((u8)CONST_MASK(nr))); + : "iq" (CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index facba9bc30ca..fb34ff641e0a 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -70,14 +70,17 @@ do { \ #define HAVE_ARCH_BUG #define BUG() \ do { \ + instrumentation_begin(); \ _BUG_FLAGS(ASM_UD2, 0); \ unreachable(); \ } while (0) #define __WARN_FLAGS(flags) \ do { \ + instrumentation_begin(); \ _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \ annotate_reachable(); \ + instrumentation_end(); \ } while (0) #include <asm-generic/bug.h> diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 63feaf2a5f93..b192d917a6d0 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_CACHEFLUSH_H #define _ASM_X86_CACHEFLUSH_H +#include <linux/mm.h> + /* Caches aren't brain-dead on the intel. */ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index d79d1e622dcf..0ada98d5d09f 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +#define HAVE_CSUM_COPY_USER #ifdef CONFIG_X86_32 # include <asm/checksum_32.h> #else diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index f57b94e02c57..11624c8a9d8d 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -44,18 +44,21 @@ static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); } -static inline __wsum csum_partial_copy_from_user(const void __user *src, - void *dst, - int len, __wsum sum, - int *err_ptr) +static inline __wsum csum_and_copy_from_user(const void __user *src, + void *dst, int len, + __wsum sum, int *err_ptr) { __wsum ret; might_sleep(); - stac(); + if (!user_access_begin(src, len)) { + if (len) + *err_ptr = -EFAULT; + return sum; + } ret = csum_partial_copy_generic((__force void *)src, dst, len, sum, err_ptr, NULL); - clac(); + user_access_end(); return ret; } @@ -173,7 +176,6 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, /* * Copy and checksum to user */ -#define HAVE_CSUM_COPY_USER static inline __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, @@ -182,11 +184,10 @@ static inline __wsum csum_and_copy_to_user(const void *src, __wsum ret; might_sleep(); - if (access_ok(dst, len)) { - stac(); + if (user_access_begin(dst, len)) { ret = csum_partial_copy_generic(src, 
(__force void *)dst, len, sum, NULL, err_ptr); - clac(); + user_access_end(); return ret; } diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 3ec6d3267cf9..0a289b87e872 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -129,27 +129,19 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); -#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 -#define HAVE_CSUM_COPY_USER 1 - - /* Do not call this directly. Use the wrappers below */ extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, int len, __wsum sum, int *src_err_ptr, int *dst_err_ptr); -extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, +extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp); -extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst, +extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp); extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum); -/* Old names. To be removed. */ -#define csum_and_copy_to_user csum_partial_copy_to_user -#define csum_and_copy_from_user csum_partial_copy_from_user - /** * ip_compute_csum - Compute an 16bit IP checksum. * @buff: buffer address. diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 52e9f3480f69..d4edf281fff4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -214,7 +214,11 @@ static inline bool in_compat_syscall(void) #endif struct compat_siginfo; -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const kernel_siginfo_t *from, bool x32_ABI); + +#ifdef CONFIG_X86_X32_ABI +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const kernel_siginfo_t *from); +#define copy_siginfo_to_user32 copy_siginfo_to_user32 +#endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index cf3d621c6892..eb8fcede9e3b 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -20,12 +20,14 @@ #define X86_CENTAUR_FAM6_C7_D 0xd #define X86_CENTAUR_FAM6_NANO 0xf +#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Base macro for CPU matching + * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@_vendor * @_family: The family number or X86_FAMILY_ANY * @_model: The model number, model constant or X86_MODEL_ANY + * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY * @_data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer @@ -37,16 +39,35 @@ * into another macro at the usage site for good reasons, then please * start this local macro with X86_MATCH to allow easy grepping. 
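A usage sketch for the steppings matching whose definition follows in the next hunk; the model number and driver data here are illustrative, not taken from this patch:

/* Match family 6, model 0x8e, steppings 0x9 through 0xc only.
 * X86_STEPPINGS(0x9, 0xc) expands to GENMASK(0xc, 0x9). */
static const struct x86_cpu_id my_ids[] __initconst = {
	X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, 0x8e,
				X86_STEPPINGS(0x9, 0xc), X86_FEATURE_ANY, &my_data),
	{}
};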
*/ -#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(_vendor, _family, _model, \ - _feature, _data) { \ +#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ .vendor = X86_VENDOR_##_vendor, \ .family = _family, \ .model = _model, \ + .steppings = _steppings, \ .feature = _feature, \ .driver_data = (unsigned long) _data \ } /** + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ + X86_STEPPING_ANY, feature, data) + +/** * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@vendor @@ -139,6 +160,10 @@ #define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) +#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ + steppings, X86_FEATURE_ANY, data) + /* * Match specific microcode revisions. * diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 02c0078d3787..8902fdb7de13 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -11,15 +11,11 @@ #ifdef CONFIG_X86_64 /* Macro to enforce the same ordering and stack sizes */ -#define ESTACKS_MEMBERS(guardsize, db2_holesize)\ +#define ESTACKS_MEMBERS(guardsize) \ char DF_stack_guard[guardsize]; \ char DF_stack[EXCEPTION_STKSZ]; \ char NMI_stack_guard[guardsize]; \ char NMI_stack[EXCEPTION_STKSZ]; \ - char DB2_stack_guard[guardsize]; \ - char DB2_stack[db2_holesize]; \ - char DB1_stack_guard[guardsize]; \ - char DB1_stack[EXCEPTION_STKSZ]; \ char DB_stack_guard[guardsize]; \ char DB_stack[EXCEPTION_STKSZ]; \ char MCE_stack_guard[guardsize]; \ @@ -28,12 +24,12 @@ /* The exception stacks' physical storage. No guard pages required */ struct exception_stacks { - ESTACKS_MEMBERS(0, 0) + ESTACKS_MEMBERS(0) }; /* The effective cpu entry area mapping with guard pages. 
*/ struct cea_exception_stacks { - ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) + ESTACKS_MEMBERS(PAGE_SIZE) }; /* @@ -42,8 +38,6 @@ struct cea_exception_stacks { enum exception_stack_ordering { ESTACK_DF, ESTACK_NMI, - ESTACK_DB2, - ESTACK_DB1, ESTACK_DB, ESTACK_MCE, N_EXCEPTION_STACKS diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index db189945e9b0..02dabc9e77b0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -362,6 +362,7 @@ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */ #define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ @@ -407,5 +408,6 @@ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 1a8609a15856..e89558a3fe4a 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7); native_set_debugreg(register, value) #endif -static inline unsigned long native_get_debugreg(int regno) +static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ @@ -47,7 +47,7 @@ static inline unsigned long native_get_debugreg(int regno) return val; } -static inline void native_set_debugreg(int regno, unsigned long value) +static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: @@ -85,7 +85,7 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } -static inline int hw_breakpoint_active(void) +static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } @@ -94,24 +94,38 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); -#ifdef CONFIG_X86_64 -DECLARE_PER_CPU(int, debug_stack_usage); -static inline void debug_stack_usage_inc(void) +static __always_inline unsigned long local_db_save(void) { - __this_cpu_inc(debug_stack_usage); + unsigned long dr7; + + if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) + return 0; + + get_debugreg(dr7, 7); + dr7 &= ~0x400; /* architecturally set bit */ + if (dr7) + set_debugreg(0, 7); + /* + * Ensure the compiler doesn't lower the above statements into + * the critical section; disabling breakpoints late would not + * be good. + */ + barrier(); + + return dr7; } -static inline void debug_stack_usage_dec(void) + +static __always_inline void local_db_restore(unsigned long dr7) { - __this_cpu_dec(debug_stack_usage); + /* + * Ensure the compiler doesn't raise this statement into + * the critical section; enabling breakpoints early would + * not be good. 
+ */ + barrier(); + if (dr7) + set_debugreg(dr7, 7); } -void debug_stack_set_zero(void); -void debug_stack_reset(void); -#else /* !X86_64 */ -static inline void debug_stack_set_zero(void) { } -static inline void debug_stack_reset(void) { } -static inline void debug_stack_usage_inc(void) { } -static inline void debug_stack_usage_dec(void) { } -#endif /* X86_64 */ #ifdef CONFIG_CPU_SUP_AMD extern void set_dr_addr_mask(unsigned long mask, int dr); diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index de9e7841f953..630891d25819 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -3,8 +3,10 @@ #define _ASM_X86_DELAY_H #include <asm-generic/delay.h> +#include <linux/init.h> -void use_tsc_delay(void); +void __init use_tsc_delay(void); +void __init use_tpause_delay(void); void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 68a99d2a5f33..1ced11d31932 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -40,11 +40,6 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->l = 0; } -extern struct desc_ptr idt_descr; -extern gate_desc idt_table[]; -extern const struct desc_ptr debug_idt_descr; -extern gate_desc debug_idt_table[]; - struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); @@ -214,7 +209,7 @@ static inline void native_load_gdt(const struct desc_ptr *dtr) asm volatile("lgdt %0"::"m" (*dtr)); } -static inline void native_load_idt(const struct desc_ptr *dtr) +static __always_inline void native_load_idt(const struct desc_ptr *dtr) { asm volatile("lidt %0"::"m" (*dtr)); } @@ -386,64 +381,23 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void update_intr_gate(unsigned int n, const void *addr); void alloc_intr_gate(unsigned int n, const void *addr); extern unsigned long system_vectors[]; -#ifdef CONFIG_X86_64 -DECLARE_PER_CPU(u32, debug_idt_ctr); -static inline bool is_debug_idt_enabled(void) -{ - if (this_cpu_read(debug_idt_ctr)) - return true; - - return false; -} - -static inline void load_debug_idt(void) -{ - load_idt((const struct desc_ptr *)&debug_idt_descr); -} -#else -static inline bool is_debug_idt_enabled(void) -{ - return false; -} - -static inline void load_debug_idt(void) -{ -} -#endif - -/* - * The load_current_idt() must be called with interrupts disabled - * to avoid races. That way the IDT will always be set back to the expected - * descriptor. It's also called when a CPU is being initialized, and - * that doesn't need to disable interrupts, as nothing should be - * bothering the CPU then. 
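The new local_db_save()/local_db_restore() pair above is meant as a bracket around kernel code that must not recurse through #DB; a sketch of the intended in-kernel usage (the surrounding context is illustrative):

unsigned long dr7;

dr7 = local_db_save();		/* breakpoints off, previous DR7 captured */
/* ... #DB-unsafe work, e.g. exception entry or text poking ... */
local_db_restore(dr7);		/* previous breakpoint state restored */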
- */ -static inline void load_current_idt(void) -{ - if (is_debug_idt_enabled()) - load_debug_idt(); - else - load_idt((const struct desc_ptr *)&idt_descr); -} - +extern void load_current_idt(void); extern void idt_setup_early_handler(void); extern void idt_setup_early_traps(void); extern void idt_setup_traps(void); extern void idt_setup_apic_and_irq_gates(void); +extern bool idt_is_f00f_address(unsigned long address); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); extern void idt_setup_ist_traps(void); -extern void idt_setup_debugidt_traps(void); #else static inline void idt_setup_early_pf(void) { } static inline void idt_setup_ist_traps(void) { } -static inline void idt_setup_debugidt_traps(void) { } #endif extern void idt_invalidate(void *addr); diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 7e31f7f1bb06..49bd6cf3eec9 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -3,7 +3,7 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) +#ifdef CONFIG_IOMMU_API void *iommu; /* hook for IOMMU specific extension */ #endif }; diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 00f7cf45e699..8e95aa4b0d17 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -74,7 +74,7 @@ #define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT)) #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h index af9a14ac8962..54a6e4a2e132 100644 --- a/arch/x86/include/asm/doublefault.h +++ b/arch/x86/include/asm/doublefault.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_DOUBLEFAULT_H #define _ASM_X86_DOUBLEFAULT_H -#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +#ifdef CONFIG_X86_32 extern void doublefault_init_cpu_tss(void); #else static inline void doublefault_init_cpu_tss(void) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index cdcf48d52a12..e7d2ccfdd507 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,12 +3,13 @@ #define _ASM_X86_EFI_H #include <asm/fpu/api.h> -#include <asm/pgtable.h> #include <asm/processor-flags.h> #include <asm/tlb.h> #include <asm/nospec-branch.h> #include <asm/mmu_context.h> #include <linux/build_bug.h> +#include <linux/kernel.h> +#include <linux/pgtable.h> extern unsigned long efi_fw_vendor, efi_config_table; @@ -178,8 +179,10 @@ extern void efi_free_boot_services(void); extern pgd_t * __init efi_uv1_memmap_phys_prolog(void); extern void __init efi_uv1_memmap_phys_epilog(pgd_t *save_pgd); +/* kexec external ABI */ struct efi_setup_data { u64 fw_vendor; + u64 __unused; u64 tables; u64 smbios; u64 reserved[8]; @@ -223,14 +226,21 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, /* arch specific definitions used by the stub code */ -__attribute_const__ bool efi_is_64bit(void); +#ifdef CONFIG_EFI_MIXED + +#define ARCH_HAS_EFISTUB_WRAPPERS + +static inline bool efi_is_64bit(void) +{ + extern const bool efi_is64; + + return efi_is64; +} static inline bool efi_is_native(void) { if (!IS_ENABLED(CONFIG_X86_64)) return true; - if (!IS_ENABLED(CONFIG_EFI_MIXED)) - return true; return efi_is_64bit(); } @@ -284,6 +294,15 @@ static inline u32 
efi64_convert_status(efi_status_t status) #define __efi64_argmap_allocate_pool(type, size, buffer) \ ((type), (size), efi64_zero_upper(buffer)) +#define __efi64_argmap_create_event(type, tpl, f, c, event) \ + ((type), (tpl), (f), (c), efi64_zero_upper(event)) + +#define __efi64_argmap_set_timer(event, type, time) \ + ((event), (type), lower_32_bits(time), upper_32_bits(time)) + +#define __efi64_argmap_wait_for_event(num, event, index) \ + ((num), (event), efi64_zero_upper(index)) + #define __efi64_argmap_handle_protocol(handle, protocol, interface) \ ((handle), (protocol), efi64_zero_upper(interface)) @@ -305,6 +324,10 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_load_file(protocol, path, policy, bufsize, buf) \ ((protocol), (path), (policy), efi64_zero_upper(bufsize), (buf)) +/* Graphics Output Protocol */ +#define __efi64_argmap_query_mode(gop, mode, size, info) \ + ((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro @@ -333,15 +356,26 @@ static inline u32 efi64_convert_status(efi_status_t status) #define efi_bs_call(func, ...) \ (efi_is_native() \ - ? efi_system_table()->boottime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table(), \ - boottime), func, __VA_ARGS__)) + ? efi_system_table->boottime->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_table_attr(efi_system_table, \ + boottime), \ + func, __VA_ARGS__)) #define efi_rt_call(func, ...) \ (efi_is_native() \ - ? efi_system_table()->runtime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table(), \ - runtime), func, __VA_ARGS__)) + ? efi_system_table->runtime->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_table_attr(efi_system_table, \ + runtime), \ + func, __VA_ARGS__)) + +#else /* CONFIG_EFI_MIXED */ + +static inline bool efi_is_64bit(void) +{ + return IS_ENABLED(CONFIG_X86_64); +} + +#endif /* CONFIG_EFI_MIXED */ extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 69c0f892e310..452beed7892b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -281,9 +281,29 @@ extern u32 elf_hwcap2; /* * An executable for which elf_read_implies_exec() returns TRUE will * have the READ_IMPLIES_EXEC personality flag set automatically.
+ *
+ * The decision process for determining the result is:
+ *
+ *                 CPU: | lacks NX*  | has NX, ia32 | has NX, x86_64 |
+ * ELF:                 |            |              |                |
+ * ---------------------|------------|--------------|----------------|
+ * missing PT_GNU_STACK | exec-all   | exec-all     | exec-none      |
+ * PT_GNU_STACK == RWX  | exec-stack | exec-stack   | exec-stack     |
+ * PT_GNU_STACK == RW   | exec-none  | exec-none    | exec-none      |
+ *
+ * exec-all  : all PROT_READ user mappings are executable, except when
+ *             backed by files on a noexec-filesystem.
+ * exec-none : only PROT_EXEC user mappings are executable.
+ * exec-stack: only the stack and PROT_EXEC user mappings are executable.
+ * + * *this column has no architectural effect: NX markings are ignored by + * hardware, but may have behavioral effects when "wants X" collides with + * "cannot be X" constraints in memory permission flags, as in + * https://lkml.kernel.org/r/20190418055759.GA3155@mellanox.com + * */ #define elf_read_implies_exec(ex, executable_stack) \ - (executable_stack != EXSTACK_DISABLE_X) + (mmap_is_ia32() && executable_stack == EXSTACK_DEFAULT) struct task_struct; diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h deleted file mode 100644 index 416422762845..000000000000 --- a/arch/x86/include/asm/entry_arch.h +++ /dev/null @@ -1,56 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This file is designed to contain the BUILD_INTERRUPT specifications for - * all of the extra named interrupt vectors used by the architecture. - * Usually this is the Inter Process Interrupts (IPIs) - */ - -/* - * The following vectors are part of the Linux architecture, there - * is no hardware IRQ pin equivalent for them, they are triggered - * through the ICC by us (IPIs) - */ -#ifdef CONFIG_SMP -BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) -BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) -BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) -BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) -#endif - -#ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) -BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) -BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) -#endif - -/* - * every pentium local APIC has two 'local interrupts', with a - * soft-definable vector attached to both interrupts, one of - * which is a timer interrupt, the other one is error counter - * overflow. 
Linux uses the local APIC timer interrupt to get - * a much simpler SMP time architecture: - */ -#ifdef CONFIG_X86_LOCAL_APIC - -BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) -BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) -BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - -#ifdef CONFIG_IRQ_WORK -BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) -#endif - -#ifdef CONFIG_X86_MCE_AMD -BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) -#endif -#endif diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 28183ee3cc42..b9527a54db99 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -152,7 +152,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 7ec59edde154..d43717b423cb 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -31,8 +31,8 @@ #define CSW fd_routine[can_use_virtual_dma & 1] -#define fd_inb(port) inb_p(port) -#define fd_outb(value, port) outb_p(value, port) +#define fd_inb(base, reg) inb_p((base) + (reg)) +#define fd_outb(value, base, reg) outb_p(value, (base) + (reg)) #define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy") #define fd_free_dma() CSW._free_dma(FLOPPY_DMA) @@ -77,25 +77,26 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id) st = 1; for (lcount = virtual_dma_count, lptr = virtual_dma_addr; lcount; lcount--, lptr++) { - st = inb(virtual_dma_port + 4) & 0xa0; - if (st != 0xa0) + st = inb(virtual_dma_port + FD_STATUS); + st &= STATUS_DMA | STATUS_READY; + if (st != (STATUS_DMA | STATUS_READY)) break; if (virtual_dma_mode) - outb_p(*lptr, virtual_dma_port + 5); + outb_p(*lptr, virtual_dma_port + FD_DATA); else - *lptr = inb_p(virtual_dma_port + 5); + *lptr = inb_p(virtual_dma_port + FD_DATA); } virtual_dma_count = lcount; virtual_dma_addr = lptr; - st = inb(virtual_dma_port + 4); + st = inb(virtual_dma_port + FD_STATUS); } #ifdef TRACE_FLPY_INT calls++; #endif - if (st == 0x20) + if (st == STATUS_DMA) return IRQ_HANDLED; - if (!(st & 0x20)) { + if (!(st & STATUS_DMA)) { virtual_dma_residue += virtual_dma_count; virtual_dma_count = 0; #ifdef TRACE_FLPY_INT diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 44c48e34d799..42159f45bf9c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -31,7 +31,8 @@ extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct task_struct *dst, struct task_struct *src); -extern void fpu__clear(struct fpu *fpu); +extern void fpu__clear_user_states(struct fpu *fpu); +extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); @@ -92,7 +93,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave) * XRSTORS requires these bits set in xcomp_bv, or it will * 
trigger #GP: */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; } static inline void fpstate_init_fxstate(struct fxregs_state *fx) @@ -399,7 +400,10 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) u32 hmask = mask >> 32; int err; - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + if (static_cpu_has(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c6136d79f8c0..422d8369012a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,19 +21,29 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) -/* Supervisor features */ -#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) - -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE | \ - XFEATURE_MASK_YMM | \ - XFEATURE_MASK_OPMASK | \ - XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM | \ - XFEATURE_MASK_PKRU | \ - XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) +/* All currently supported user features */ +#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + +/* All currently supported supervisor features */ +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) + +/* + * Unsupported supervisor features. When a supervisor feature in this mask is + * supported in the future, move it to the supported supervisor feature mask. + */ +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) + +/* All supervisor states including supported and unsupported states. 
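A brief sketch of how these masks are meant to be combined once the helpers just below are in place (`xsave_buf` is a hypothetical buffer pointer; the mask macros and copy_kernel_to_xregs_err() are the ones from the hunks above):

	/* Restore only the supported supervisor states from @xsave_buf */
	u64 mask = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
	int err = copy_kernel_to_xregs_err(xsave_buf, mask);

copy_kernel_to_xregs_err() now picks XRSTORS when the CPU has XSAVES, which is required once supervisor states live in the compacted format.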
*/ +#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " @@ -41,7 +51,18 @@ #define REX_PREFIX #endif -extern u64 xfeatures_mask; +extern u64 xfeatures_mask_all; + +static inline u64 xfeatures_mask_supervisor(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_user(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; +} + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, @@ -54,8 +75,9 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); +void copy_supervisor_to_kernel(struct xregs_state *xsave); /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -extern int validate_xstate_header(const struct xstate_header *hdr); +int validate_user_xstate_header(const struct xstate_header *hdr); #endif diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 85be2f506272..84b9449be080 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -56,16 +56,23 @@ struct dyn_arch_ftrace { #ifndef __ASSEMBLY__ +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +extern void set_ftrace_ops_ro(void); +#else +static inline void set_ftrace_ops_ro(void) { } +#endif + #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { /* * Compare the symbol name with the system call name. Skip the - * "__x64_sys", "__ia32_sys" or simple "sys" prefix. + * "__x64_sys", "__ia32_sys", "__do_sys" or simple "sys" prefix. 
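For example (symbol names illustrative, not from this hunk): with name == "sys_openat", the matcher accepts "sys_openat" (compared from offset 3), "__x64_sys_openat" (sym + 9), "__ia32_sys_openat" (sym + 10), and the newly covered "__do_sys_openat" (sym + 8) — each comparison reducing to the common suffix "_openat".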
*/ return !strcmp(sym + 3, name + 3) || (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) || - (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)); + (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)) || + (!strncmp(sym, "__do_sys", 8) && !strcmp(sym + 8, name + 3)); } #ifndef COMPILE_OFFSETS diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a8059930056d..0f420b24e0fc 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,15 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); -extern void kunmap_high(struct page *page); - -void *kmap(struct page *page); -void kunmap(struct page *page); - -void *kmap_atomic_prot(struct page *page, pgprot_t prot); -void *kmap_atomic(struct page *page); -void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index f65cfb48cfdd..1721b1aadeb1 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -7,14 +7,4 @@ #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 4154bc5f6a4e..74c12437401e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,28 +28,6 @@ #include <asm/irq.h> #include <asm/sections.h> -/* Interrupt handlers registered during init_IRQ */ -extern asmlinkage void apic_timer_interrupt(void); -extern asmlinkage void x86_platform_ipi(void); -extern asmlinkage void kvm_posted_intr_ipi(void); -extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); -extern asmlinkage void kvm_posted_intr_nested_ipi(void); -extern asmlinkage void error_interrupt(void); -extern asmlinkage void irq_work_interrupt(void); -extern asmlinkage void uv_bau_message_intr1(void); - -extern asmlinkage void spurious_interrupt(void); -extern asmlinkage void thermal_interrupt(void); -extern asmlinkage void reschedule_interrupt(void); - -extern asmlinkage void irq_move_cleanup_interrupt(void); -extern asmlinkage void reboot_interrupt(void); -extern asmlinkage void threshold_interrupt(void); -extern asmlinkage void deferred_error_interrupt(void); - -extern asmlinkage void call_function_interrupt(void); -extern asmlinkage void call_function_single_interrupt(void); - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 29336574d0bc..7a4d2062385c 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -11,17 +11,6 @@ #include <linux/types.h> #include <asm/page.h> - -/* - * While not explicitly listed in the TLFS, Hyper-V always runs with a page size - * of 4096. These definitions are used when communicating with Hyper-V using - * guest physical pages and guest physical page addresses, since the guest page - * size may not be 4096 on all architectures. 
- */ -#define HV_HYP_PAGE_SHIFT 12 -#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) -#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) - /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). @@ -39,78 +28,41 @@ #define HYPERV_CPUID_MAX 0x4000ffff /* - * Feature identification. EAX indicates which features are available - * to the partition based upon the current partition privileges. - * These are HYPERV_CPUID_FEATURES.EAX bits. + * Aliases for Group A features that have X64 in the name. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EAX bits. */ -/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ -#define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0) -/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ -#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) -/* - * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM - * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available - */ -#define HV_X64_MSR_SYNIC_AVAILABLE BIT(2) -/* - * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through - * HV_X64_MSR_STIMER3_COUNT) available - */ -#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) -/* - * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) - * are available - */ -#define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4) -/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ -#define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5) -/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ -#define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6) -/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ -#define HV_X64_MSR_RESET_AVAILABLE BIT(7) -/* - * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, - * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, - * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available - */ -#define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8) -/* Partition reference TSC MSR is available */ -#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) -/* Partition Guest IDLE MSR is available */ -#define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10) -/* - * There is a single feature flag that signifies if the partition has access - * to MSRs with local APIC and TSC frequencies. - */ -#define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) -/* AccessReenlightenmentControls privilege */ -#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) -/* AccessTscInvariantControls privilege */ -#define HV_X64_ACCESS_TSC_INVARIANT BIT(15) +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE \ + HV_MSR_VP_RUNTIME_AVAILABLE +#define HV_X64_MSR_SYNIC_AVAILABLE \ + HV_MSR_SYNIC_AVAILABLE +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE \ + HV_MSR_APIC_ACCESS_AVAILABLE +#define HV_X64_MSR_HYPERCALL_AVAILABLE \ + HV_MSR_HYPERCALL_AVAILABLE +#define HV_X64_MSR_VP_INDEX_AVAILABLE \ + HV_MSR_VP_INDEX_AVAILABLE +#define HV_X64_MSR_RESET_AVAILABLE \ + HV_MSR_RESET_AVAILABLE +#define HV_X64_MSR_GUEST_IDLE_AVAILABLE \ + HV_MSR_GUEST_IDLE_AVAILABLE +#define HV_X64_ACCESS_FREQUENCY_MSRS \ + HV_ACCESS_FREQUENCY_MSRS +#define HV_X64_ACCESS_REENLIGHTENMENT \ + HV_ACCESS_REENLIGHTENMENT +#define HV_X64_ACCESS_TSC_INVARIANT \ + HV_ACCESS_TSC_INVARIANT /* - * Feature identification: indicates which flags were specified at partition - * creation. The format is the same as the partition creation flag structure - * defined in section Partition Creation Flags. - * These are HYPERV_CPUID_FEATURES.EBX bits. + * Aliases for Group B features that have X64 in the name. 
+ * On x86/x64 these are HYPERV_CPUID_FEATURES.EBX bits. */ -#define HV_X64_CREATE_PARTITIONS BIT(0) -#define HV_X64_ACCESS_PARTITION_ID BIT(1) -#define HV_X64_ACCESS_MEMORY_POOL BIT(2) -#define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3) -#define HV_X64_POST_MESSAGES BIT(4) -#define HV_X64_SIGNAL_EVENTS BIT(5) -#define HV_X64_CREATE_PORT BIT(6) -#define HV_X64_CONNECT_PORT BIT(7) -#define HV_X64_ACCESS_STATS BIT(8) -#define HV_X64_DEBUGGING BIT(11) -#define HV_X64_CPU_POWER_MANAGEMENT BIT(12) +#define HV_X64_POST_MESSAGES HV_POST_MESSAGES +#define HV_X64_SIGNAL_EVENTS HV_SIGNAL_EVENTS /* - * Feature identification. EDX indicates which miscellaneous features - * are available to the partition. - * These are HYPERV_CPUID_FEATURES.EDX bits. + * Group D Features. The bit assignments are custom to each architecture. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EDX bits. */ /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ #define HV_X64_MWAIT_AVAILABLE BIT(0) @@ -131,6 +83,8 @@ #define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) /* Crash MSR available */ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +/* Support for debug MSRs available */ +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) /* stimer Direct Mode is available */ #define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) @@ -187,7 +141,7 @@ * processor, except for virtual processors that are reported as sibling SMT * threads. */ -#define HV_X64_NO_NONARCH_CORESHARING BIT(18) +#define HV_X64_NO_NONARCH_CORESHARING BIT(18) /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ #define HV_X64_NESTED_DIRECT_FLUSH BIT(17) @@ -295,43 +249,6 @@ union hv_x64_msr_hypercall_contents { } __packed; }; -/* - * TSC page layout. - */ -struct ms_hyperv_tsc_page { - volatile u32 tsc_sequence; - u32 reserved1; - volatile u64 tsc_scale; - volatile s64 tsc_offset; - u64 reserved2[509]; -} __packed; - -/* - * The guest OS needs to register the guest ID with the hypervisor. - * The guest ID is a 64 bit entity and the structure of this ID is - * specified in the Hyper-V specification: - * - * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx - * - * While the current guideline does not specify how Linux guest ID(s) - * need to be generated, our plan is to publish the guidelines for - * Linux and other guest operating systems that currently are hosted - * on Hyper-V. The implementation here conforms to this yet - * unpublished guidelines. - * - * - * Bit(s) - * 63 - Indicates if the OS is Open Source or not; 1 is Open Source - * 62:56 - Os Type; Linux is 0x100 - * 55:48 - Distro specific identification - * 47:16 - Linux kernel version number - * 15:0 - Distro specific identification - * - * - */ - -#define HV_LINUX_VENDOR_ID 0x8100 - struct hv_reenlightenment_control { __u64 vector:8; __u64 reserved1:8; @@ -355,31 +272,12 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) -/* - * Crash notification (HV_X64_MSR_CRASH_CTL) flags. - */ -#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) -#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) #define HV_IPI_LOW_VECTOR 0x10 #define HV_IPI_HIGH_VECTOR 0xff -/* Declare the various hypercall operations. 
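The practical effect of this aliasing is that existing x86 feature checks compile unchanged (a sketch; ms_hyperv is the usual x86 feature-info struct, the helper is hypothetical):

	if (ms_hyperv.features & HV_X64_MSR_HYPERCALL_AVAILABLE)
		setup_hypercall_page();	/* hypothetical; the HV_X64_* name now
					 * resolves to the generic HV_MSR_*
					 * bit from asm-generic/hyperv-tlfs.h */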
*/ -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 -#define HVCALL_SEND_IPI 0x000b -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 -#define HVCALL_SEND_IPI_EX 0x0015 -#define HVCALL_POST_MESSAGE 0x005c -#define HVCALL_SIGNAL_EVENT 0x005d -#define HVCALL_RETARGET_INTERRUPT 0x007e -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 - #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ @@ -391,75 +289,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 -#define HV_PROCESSOR_POWER_STATE_C0 0 -#define HV_PROCESSOR_POWER_STATE_C1 1 -#define HV_PROCESSOR_POWER_STATE_C2 2 -#define HV_PROCESSOR_POWER_STATE_C3 3 - -#define HV_FLUSH_ALL_PROCESSORS BIT(0) -#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) -#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) - -enum HV_GENERIC_SET_FORMAT { - HV_GENERIC_SET_SPARSE_4K, - HV_GENERIC_SET_ALL, -}; - -#define HV_PARTITION_ID_SELF ((u64)-1) - -#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) -#define HV_HYPERCALL_FAST_BIT BIT(16) -#define HV_HYPERCALL_VARHEAD_OFFSET 17 -#define HV_HYPERCALL_REP_COMP_OFFSET 32 -#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) -#define HV_HYPERCALL_REP_START_OFFSET 48 -#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) - -/* hypercall status code */ -#define HV_STATUS_SUCCESS 0 -#define HV_STATUS_INVALID_HYPERCALL_CODE 2 -#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 -#define HV_STATUS_INVALID_ALIGNMENT 4 -#define HV_STATUS_INVALID_PARAMETER 5 -#define HV_STATUS_INSUFFICIENT_MEMORY 11 -#define HV_STATUS_INVALID_PORT_ID 17 -#define HV_STATUS_INVALID_CONNECTION_ID 18 -#define HV_STATUS_INSUFFICIENT_BUFFERS 19 - -/* - * The Hyper-V TimeRefCount register and the TSC - * page provide a guest VM clock with 100ns tick rate - */ -#define HV_CLOCK_HZ (NSEC_PER_SEC/100) - -typedef struct _HV_REFERENCE_TSC_PAGE { - __u32 tsc_sequence; - __u32 res1; - __u64 tsc_scale; - __s64 tsc_offset; -} __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; - -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) -/* Valid SynIC vectors are 16-255. */ -#define HV_SYNIC_FIRST_VALID_VECTOR (16) - -#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) -#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) -#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) -#define HV_SYNIC_SINT_MASKED (1ULL << 16) -#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) -#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) - -#define HV_SYNIC_STIMER_COUNT (4) - -/* Define synthetic interrupt controller message constants. */ -#define HV_MESSAGE_SIZE (256) -#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) -#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) /* Define hypervisor message types. */ enum hv_message_type { @@ -470,76 +299,25 @@ enum hv_message_type { HVMSG_GPA_INTERCEPT = 0x80000001, /* Timer notification messages. */ - HVMSG_TIMER_EXPIRED = 0x80000010, + HVMSG_TIMER_EXPIRED = 0x80000010, /* Error messages. 
*/ HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, - HVMSG_UNSUPPORTED_FEATURE = 0x80000022, + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, /* Trace buffer complete messages. */ HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, /* Platform-specific processor intercept messages. */ - HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, + HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, HVMSG_X64_MSR_INTERCEPT = 0x80010001, - HVMSG_X64_CPUID_INTERCEPT = 0x80010002, + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, - HVMSG_X64_APIC_EOI = 0x80010004, - HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 -}; - -/* Define synthetic interrupt controller message flags. */ -union hv_message_flags { - __u8 asu8; - struct { - __u8 msg_pending:1; - __u8 reserved:7; - } __packed; -}; - -/* Define port identifier type. */ -union hv_port_id { - __u32 asu32; - struct { - __u32 id:24; - __u32 reserved:8; - } __packed u; + HVMSG_X64_APIC_EOI = 0x80010004, + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define synthetic interrupt controller message header. */ -struct hv_message_header { - __u32 message_type; - __u8 payload_size; - union hv_message_flags message_flags; - __u8 reserved[2]; - union { - __u64 sender; - union hv_port_id port; - }; -} __packed; - -/* Define synthetic interrupt controller message format. */ -struct hv_message { - struct hv_message_header header; - union { - __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; - } u; -} __packed; - -/* Define the synthetic interrupt message page layout. */ -struct hv_message_page { - struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; -} __packed; - -/* Define timer message payload structure. */ -struct hv_timer_message_payload { - __u32 timer_index; - __u32 reserved; - __u64 expiration_time; /* When the timer expired */ - __u64 delivery_time; /* When the message was delivered */ -} __packed; - struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; @@ -767,187 +545,11 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF -/* Define synthetic interrupt controller flag constants. */ -#define HV_EVENT_FLAGS_COUNT (256 * 8) -#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) - -/* - * Synthetic timer configuration. - */ -union hv_stimer_config { - u64 as_uint64; - struct { - u64 enable:1; - u64 periodic:1; - u64 lazy:1; - u64 auto_enable:1; - u64 apic_vector:8; - u64 direct_mode:1; - u64 reserved_z0:3; - u64 sintx:4; - u64 reserved_z1:44; - } __packed; -}; - - -/* Define the synthetic interrupt controller event flags format. */ -union hv_synic_event_flags { - unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; -}; - -/* Define SynIC control register. */ -union hv_synic_scontrol { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:63; - } __packed; -}; - -/* Define synthetic interrupt source. 
*/ -union hv_synic_sint { - u64 as_uint64; - struct { - u64 vector:8; - u64 reserved1:8; - u64 masked:1; - u64 auto_eoi:1; - u64 polling:1; - u64 reserved2:45; - } __packed; -}; - -/* Define the format of the SIMP register */ -union hv_synic_simp { - u64 as_uint64; - struct { - u64 simp_enabled:1; - u64 preserved:11; - u64 base_simp_gpa:52; - } __packed; -}; - -/* Define the format of the SIEFP register */ -union hv_synic_siefp { - u64 as_uint64; - struct { - u64 siefp_enabled:1; - u64 preserved:11; - u64 base_siefp_gpa:52; - } __packed; -}; - -struct hv_vpset { - u64 format; - u64 valid_bank_mask; - u64 bank_contents[]; -} __packed; - -/* HvCallSendSyntheticClusterIpi hypercall */ -struct hv_send_ipi { - u32 vector; - u32 reserved; - u64 cpu_mask; -} __packed; - -/* HvCallSendSyntheticClusterIpiEx hypercall */ -struct hv_send_ipi_ex { - u32 vector; - u32 reserved; - struct hv_vpset vp_set; -} __packed; - -/* HvFlushGuestPhysicalAddressSpace hypercalls */ -struct hv_guest_mapping_flush { - u64 address_space; - u64 flags; -} __packed; - -/* - * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited - * by the bitwidth of "additional_pages" in union hv_gpa_page_range. - */ -#define HV_MAX_FLUSH_PAGES (2048) - -/* HvFlushGuestPhysicalAddressList hypercall */ -union hv_gpa_page_range { - u64 address_space; - struct { - u64 additional_pages:11; - u64 largepage:1; - u64 basepfn:52; - } page; -}; - -/* - * All input flush parameters should be in single page. The max flush - * count is equal with how many entries of union hv_gpa_page_range can - * be populated into the input parameter page. - */ -#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ - sizeof(union hv_gpa_page_range)) - -struct hv_guest_mapping_flush_list { - u64 address_space; - u64 flags; - union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; -}; - -/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ -struct hv_tlb_flush { - u64 address_space; - u64 flags; - u64 processor_mask; - u64 gva_list[]; -} __packed; - -/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ -struct hv_tlb_flush_ex { - u64 address_space; - u64 flags; - struct hv_vpset hv_vp_set; - u64 gva_list[]; -} __packed; - struct hv_partition_assist_pg { u32 tlb_lock_count; }; -union hv_msi_entry { - u64 as_uint64; - struct { - u32 address; - u32 data; - } __packed; -}; - -struct hv_interrupt_entry { - u32 source; /* 1 for MSI(-X) */ - u32 reserved1; - union hv_msi_entry msi_entry; -} __packed; -/* - * flags for hv_device_interrupt_target.flags - */ -#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 -#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 - -struct hv_device_interrupt_target { - u32 vector; - u32 flags; - union { - u64 vp_mask; - struct hv_vpset vp_set; - }; -} __packed; +#include <asm-generic/hyperv-tlfs.h> -/* HvRetargetDeviceInterrupt hypercall */ -struct hv_retarget_device_interrupt { - u64 partition_id; /* use "self" */ - u64 device_id; - struct hv_interrupt_entry int_entry; - u64 reserved2; - struct hv_device_interrupt_target int_target; -} __packed __aligned(8); #endif diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h new file mode 100644 index 000000000000..d203c541a65a --- /dev/null +++ b/arch/x86/include/asm/idtentry.h @@ -0,0 +1,652 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IDTENTRY_H +#define _ASM_X86_IDTENTRY_H + +/* Interrupts/Exceptions */ +#include <asm/trapnr.h> + +#ifndef __ASSEMBLY__ +#include <linux/hardirq.h> + +#include 
<asm/irq_stack.h> + +void idtentry_enter_user(struct pt_regs *regs); +void idtentry_exit_user(struct pt_regs *regs); + +bool idtentry_enter_cond_rcu(struct pt_regs *regs); +void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); + +/** + * DECLARE_IDTENTRY - Declare functions for simple IDT entry points + * No error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it + * declares the entry points for usage in C code. There is an ASM variant + * as well which is used to emit the entry stubs in entry_32/64.S. + */ +#define DECLARE_IDTENTRY(vector, func) \ + asmlinkage void asm_##func(void); \ + asmlinkage void xen_asm_##func(void); \ + __visible void func(struct pt_regs *regs) + +/** + * DEFINE_IDTENTRY - Emit code for simple IDT entry points + * @func: Function name of the entry point + * + * @func is called from ASM entry code with interrupts disabled. + * + * The macro is written so it acts as function definition. Append the + * body with a pair of curly brackets. + * + * idtentry_enter() contains common code which has to be invoked before + * arbitrary code in the body. idtentry_exit() contains common code + * which has to run before returning to the low level assembly code. + */ +#define DEFINE_IDTENTRY(func) \ +static __always_inline void __##func(struct pt_regs *regs); \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + __##func (regs); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs) + +/* Special case for 32bit IRET 'trap' */ +#define DECLARE_IDTENTRY_SW DECLARE_IDTENTRY +#define DEFINE_IDTENTRY_SW DEFINE_IDTENTRY + +/** + * DECLARE_IDTENTRY_ERRORCODE - Declare functions for simple IDT entry points + * Error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Same as DECLARE_IDTENTRY, but has an extra error_code argument for the + * C-handler. 
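As a concrete illustration of this pattern, before the error-code variant below — a sketch modeled on the #UD entry point declared near the end of this file (the handler body is simplified):

	/* In the header: declares asm_exc_invalid_op, the (maybe unused)
	 * xen_asm_exc_invalid_op, and the C handler prototype. */
	DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op);

	/* In traps.c: only the body is written by hand; the enter/exit
	 * plumbing is generated by the macro. */
	DEFINE_IDTENTRY(exc_invalid_op)
	{
		handle_invalid_op(regs);
	}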
+ */ +#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \ + asmlinkage void asm_##func(void); \ + asmlinkage void xen_asm_##func(void); \ + __visible void func(struct pt_regs *regs, unsigned long error_code) + +/** + * DEFINE_IDTENTRY_ERRORCODE - Emit code for simple IDT entry points + * Error code pushed by hardware + * @func: Function name of the entry point + * + * Same as DEFINE_IDTENTRY, but has an extra error_code argument + */ +#define DEFINE_IDTENTRY_ERRORCODE(func) \ +static __always_inline void __##func(struct pt_regs *regs, \ + unsigned long error_code); \ + \ +__visible noinstr void func(struct pt_regs *regs, \ + unsigned long error_code) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + __##func (regs, error_code); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs, \ + unsigned long error_code) + +/** + * DECLARE_IDTENTRY_RAW - Declare functions for raw IDT entry points + * No error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY(). + */ +#define DECLARE_IDTENTRY_RAW(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +/** + * DEFINE_IDTENTRY_RAW - Emit code for raw IDT entry points + * @func: Function name of the entry point + * + * @func is called from ASM entry code with interrupts disabled. + * + * The macro is written so it acts as function definition. Append the + * body with a pair of curly brackets. + * + * Contrary to DEFINE_IDTENTRY() this does not invoke the + * idtentry_enter/exit() helpers before and after the body invocation. This + * needs to be done in the body itself if applicable. Use if extra work + * is required before the enter/exit() helpers are invoked. + */ +#define DEFINE_IDTENTRY_RAW(func) \ +__visible noinstr void func(struct pt_regs *regs) + +/** + * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points + * Error code pushed by hardware + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_ERRORCODE() + */ +#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \ + DECLARE_IDTENTRY_ERRORCODE(vector, func) + +/** + * DEFINE_IDTENTRY_RAW_ERRORCODE - Emit code for raw IDT entry points + * @func: Function name of the entry point + * + * @func is called from ASM entry code with interrupts disabled. + * + * The macro is written so it acts as function definition. Append the + * body with a pair of curly brackets. + * + * Contrary to DEFINE_IDTENTRY_ERRORCODE() this does not invoke the + * idtentry_enter/exit() helpers before and after the body invocation. This + * needs to be done in the body itself if applicable. Use if extra work + * is required before the enter/exit() helpers are invoked. 
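For contrast with the non-RAW variants, a skeletal RAW entry point (handler and helper names hypothetical; the enter/exit helpers are the ones declared at the top of this file):

	DEFINE_IDTENTRY_RAW(exc_hypothetical)
	{
		if (user_mode(regs)) {
			idtentry_enter_user(regs);
			handle_from_user(regs);		/* hypothetical */
			idtentry_exit_user(regs);
		} else {
			bool rcu_exit = idtentry_enter_cond_rcu(regs);

			handle_from_kernel(regs);	/* hypothetical */
			idtentry_exit_cond_rcu(regs, rcu_exit);
		}
	}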
+ */ +#define DEFINE_IDTENTRY_RAW_ERRORCODE(func) \ +__visible noinstr void func(struct pt_regs *regs, unsigned long error_code) + +/** + * DECLARE_IDTENTRY_IRQ - Declare functions for device interrupt IDT entry + * points (common/spurious) + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_ERRORCODE() + */ +#define DECLARE_IDTENTRY_IRQ(vector, func) \ + DECLARE_IDTENTRY_ERRORCODE(vector, func) + +/** + * DEFINE_IDTENTRY_IRQ - Emit code for device interrupt IDT entry points + * @func: Function name of the entry point + * + * The vector number is pushed by the low level entry stub and handed + * to the function as the error_code argument, which needs to be truncated + * to a u8 because the push is sign-extending. + * + * On 64-bit idtentry_enter/exit() are invoked in the ASM entry code before + * and after switching to the interrupt stack. On 32-bit this happens in C. + * + * irq_enter/exit_rcu() are invoked before the function body and the + * KVM L1D flush request is set. + */ +#define DEFINE_IDTENTRY_IRQ(func) \ +static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ + \ +__visible noinstr void func(struct pt_regs *regs, \ + unsigned long error_code) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + irq_enter_rcu(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + __##func (regs, (u8)error_code); \ + irq_exit_rcu(); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs, u8 vector) + +/** + * DECLARE_IDTENTRY_SYSVEC - Declare functions for system vector entry points + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Maps to DECLARE_IDTENTRY(). + */ +#define DECLARE_IDTENTRY_SYSVEC(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +/** + * DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points + * @func: Function name of the entry point + * + * idtentry_enter/exit() and irq_enter/exit_rcu() are invoked before the + * function body. KVM L1D flush request is set. + * + * Runs the function on the interrupt stack if the entry hit kernel mode. + */ +#define DEFINE_IDTENTRY_SYSVEC(func) \ +static void __##func(struct pt_regs *regs); \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + irq_enter_rcu(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + run_on_irqstack_cond(__##func, regs, regs); \ + irq_exit_rcu(); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static noinline void __##func(struct pt_regs *regs) + +/** + * DEFINE_IDTENTRY_SYSVEC_SIMPLE - Emit code for simple system vector IDT + * entry points + * @func: Function name of the entry point + * + * Runs the function on the interrupted stack. No switch to IRQ stack and + * only the minimal __irq_enter/exit() handling. + * + * Only use for 'empty' vectors like reschedule IPI and KVM posted + * interrupt vectors.
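Before the SIMPLE variant is defined below, a usage sketch of DEFINE_IDTENTRY_SYSVEC itself (modeled on the APIC timer vector wired up at the end of this file; the body is abbreviated):

	DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
	{
		struct pt_regs *old_regs = set_irq_regs(regs);

		/* irq_enter/exit and the stack switch happen in the wrapper */
		ack_APIC_irq();
		local_apic_timer_interrupt();
		set_irq_regs(old_regs);
	}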
+ */ +#define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ +static __always_inline void __##func(struct pt_regs *regs); \ + \ +__visible noinstr void func(struct pt_regs *regs) \ +{ \ + bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + \ + instrumentation_begin(); \ + __irq_enter_raw(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + __##func (regs); \ + __irq_exit_raw(); \ + instrumentation_end(); \ + idtentry_exit_cond_rcu(regs, rcu_exit); \ +} \ + \ +static __always_inline void __##func(struct pt_regs *regs) + +/** + * DECLARE_IDTENTRY_XENCB - Declare functions for XEN HV callback entry point + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares three functions: + * - The ASM entry point: asm_##func + * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the ASM entry point + * + * Maps to DECLARE_IDTENTRY(). Distinct entry point to handle the 32/64-bit + * difference. + */ +#define DECLARE_IDTENTRY_XENCB(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +#ifdef CONFIG_X86_64 +/** + * DECLARE_IDTENTRY_IST - Declare functions for IST handling IDT entry points + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_RAW, but also declares the NOIST C handler, + * which is called from the ASM entry point on user mode entry. + */ +#define DECLARE_IDTENTRY_IST(vector, func) \ + DECLARE_IDTENTRY_RAW(vector, func); \ + __visible void noist_##func(struct pt_regs *regs) + +/** + * DEFINE_IDTENTRY_IST - Emit code for IST entry points + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW + */ +#define DEFINE_IDTENTRY_IST(func) \ + DEFINE_IDTENTRY_RAW(func) + +/** + * DEFINE_IDTENTRY_NOIST - Emit code for NOIST entry points which + * belong to an IST entry point (MCE, DB) + * @func: Function name of the entry point. Must be the same as + * the function name of the corresponding IST variant + * + * Maps to DEFINE_IDTENTRY_RAW(). + */ +#define DEFINE_IDTENTRY_NOIST(func) \ + DEFINE_IDTENTRY_RAW(noist_##func) + +/** + * DECLARE_IDTENTRY_DF - Declare functions for double fault + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE + */ +#define DECLARE_IDTENTRY_DF(vector, func) \ + DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) + +/** + * DEFINE_IDTENTRY_DF - Emit code for double fault + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_DF(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(func) + +#else /* CONFIG_X86_64 */ + +/* Maps to a regular IDTENTRY on 32bit for now */ +# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY +# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY + +/** + * DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Declares two functions: + * - The ASM entry point: asm_##func + * - The C handler called from the C shim + */ +#define DECLARE_IDTENTRY_DF(vector, func) \ + asmlinkage void asm_##func(void); \ + __visible void func(struct pt_regs *regs, \ + unsigned long error_code, \ + unsigned long address) + +/** + * DEFINE_IDTENTRY_DF - Emit code for double fault on 32bit + * @func: Function name of the entry point + * + * This is called through the doublefault shim, which already provides + * cr2 in the address argument.
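Putting the 64-bit IST pieces together, a sketch (the vector and function names are hypothetical):

	DECLARE_IDTENTRY_IST(X86_TRAP_XX, exc_widget);	/* also declares noist_exc_widget() */

	DEFINE_IDTENTRY_IST(exc_widget)
	{
		/* entered from kernel mode, running on the IST stack */
	}

	DEFINE_IDTENTRY_NOIST(exc_widget)
	{
		/* entered from user mode, on the regular task stack */
	}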
+ */ +#define DEFINE_IDTENTRY_DF(func) \ +__visible noinstr void func(struct pt_regs *regs, \ + unsigned long error_code, \ + unsigned long address) + +#endif /* !CONFIG_X86_64 */ + +/* C-Code mapping */ +#define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST +#define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST +#define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST + +#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_IST +#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_IST + +#define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST +#define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST +#define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST + +/** + * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM + * indirection magic. + */ +#define DECLARE_IDTENTRY_XEN(vector, func) \ + asmlinkage void xen_asm_exc_xen##func(void); \ + asmlinkage void asm_exc_xen##func(void) + +#else /* !__ASSEMBLY__ */ + +/* + * The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs. + */ +#define DECLARE_IDTENTRY(vector, func) \ + idtentry vector asm_##func func has_error_code=0 + +#define DECLARE_IDTENTRY_ERRORCODE(vector, func) \ + idtentry vector asm_##func func has_error_code=1 + +/* Special case for 32bit IRET 'trap'. Do not emit ASM code */ +#define DECLARE_IDTENTRY_SW(vector, func) + +#define DECLARE_IDTENTRY_RAW(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +#define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func) \ + DECLARE_IDTENTRY_ERRORCODE(vector, func) + +/* Entries for common/spurious (device) interrupts */ +#define DECLARE_IDTENTRY_IRQ(vector, func) \ + idtentry_irq vector func + +/* System vector entries */ +#define DECLARE_IDTENTRY_SYSVEC(vector, func) \ + idtentry_sysvec vector func + +#ifdef CONFIG_X86_64 +# define DECLARE_IDTENTRY_MCE(vector, func) \ + idtentry_mce_db vector asm_##func func + +# define DECLARE_IDTENTRY_DEBUG(vector, func) \ + idtentry_mce_db vector asm_##func func + +# define DECLARE_IDTENTRY_DF(vector, func) \ + idtentry_df vector asm_##func func + +# define DECLARE_IDTENTRY_XENCB(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +#else +# define DECLARE_IDTENTRY_MCE(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +# define DECLARE_IDTENTRY_DEBUG(vector, func) \ + DECLARE_IDTENTRY(vector, func) + +/* No ASM emitted for DF as this goes through a C shim */ +# define DECLARE_IDTENTRY_DF(vector, func) + +/* No ASM emitted for XEN hypervisor callback */ +# define DECLARE_IDTENTRY_XENCB(vector, func) + +#endif + +/* No ASM code emitted for NMI */ +#define DECLARE_IDTENTRY_NMI(vector, func) + +/* XEN NMI and DB wrapper */ +#define DECLARE_IDTENTRY_XEN(vector, func) \ + idtentry vector asm_exc_xen##func exc_##func has_error_code=0 + +/* + * ASM code to emit the common vector entry stubs where each stub is + * packed into 8 bytes. + * + * Note that the 'pushq imm8' is emitted via '.byte 0x6a, vector' because + * GCC treats the local vector variable as unsigned int and would expand + * all vectors above 0x7F to a 5-byte push. The original code did an + * adjustment of the vector number to be in the signed byte range to avoid + * this. While clever, it's mind-bogglingly counterintuitive and requires the + * odd conversion back to a real vector number in the C entry points.
Using + * .byte achieves the same thing and the only fixup needed in the C entry + * point is to mask off the bits above bit 7 because the push is sign + * extending. + */ + .align 8 +SYM_CODE_START(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + pos = . + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS + .byte 0x6a, vector + jmp asm_common_interrupt + nop + /* Ensure that the above is 8 bytes max */ + . = pos + 8 + pos=pos+8 + vector=vector+1 + .endr +SYM_CODE_END(irq_entries_start) + +#ifdef CONFIG_X86_LOCAL_APIC + .align 8 +SYM_CODE_START(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + pos = . + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + UNWIND_HINT_IRET_REGS + .byte 0x6a, vector + jmp asm_spurious_interrupt + nop + /* Ensure that the above is 8 bytes max */ + . = pos + 8 + pos=pos+8 + vector=vector+1 + .endr +SYM_CODE_END(spurious_entries_start) +#endif + +#endif /* __ASSEMBLY__ */ + +/* + * The actual entry points. Note that DECLARE_IDTENTRY*() serves two + * purposes: + * - provide the function declarations when included from C-Code + * - emit the ASM stubs when included from entry_32/64.S + * + * This avoids duplicate defines and ensures that everything is consistent. + */ + +/* + * Dummy trap number so the low level ASM macro vector number checks do not + * match which results in emitting plain IDTENTRY stubs without bells and + * whistels. + */ +#define X86_TRAP_OTHER 0xFFFF + +/* Simple exception entry points. No hardware error code */ +DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error); +DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow); +DECLARE_IDTENTRY(X86_TRAP_BR, exc_bounds); +DECLARE_IDTENTRY(X86_TRAP_UD, exc_invalid_op); +DECLARE_IDTENTRY(X86_TRAP_NM, exc_device_not_available); +DECLARE_IDTENTRY(X86_TRAP_OLD_MF, exc_coproc_segment_overrun); +DECLARE_IDTENTRY(X86_TRAP_SPURIOUS, exc_spurious_interrupt_bug); +DECLARE_IDTENTRY(X86_TRAP_MF, exc_coprocessor_error); +DECLARE_IDTENTRY(X86_TRAP_XF, exc_simd_coprocessor_error); + +/* 32bit software IRET trap. 
Do not emit ASM code */ +DECLARE_IDTENTRY_SW(X86_TRAP_IRET, iret_error); + +/* Simple exception entries with error code pushed by hardware */ +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS, exc_invalid_tss); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP, exc_segment_not_present); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, exc_general_protection); +DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC, exc_alignment_check); + +/* Raw exception entries which need extra work */ +DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); +DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); + +#ifdef CONFIG_X86_MCE +DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); +#endif + +/* NMI */ +DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); +DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi); + +/* #DB */ +DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug); +DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug); + +/* #DF */ +DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); + +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); +#endif + +/* Device interrupts common/spurious */ +DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt); +#ifdef CONFIG_X86_LOCAL_APIC +DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, spurious_interrupt); +#endif + +/* System vector entry points */ +#ifdef CONFIG_X86_LOCAL_APIC +DECLARE_IDTENTRY_SYSVEC(ERROR_APIC_VECTOR, sysvec_error_interrupt); +DECLARE_IDTENTRY_SYSVEC(SPURIOUS_APIC_VECTOR, sysvec_spurious_apic_interrupt); +DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR, sysvec_apic_timer_interrupt); +DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi); +#endif + +#ifdef CONFIG_SMP +DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); +DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); +DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); +DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); +DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +# ifdef CONFIG_X86_UV +DECLARE_IDTENTRY_SYSVEC(UV_BAU_MESSAGE, sysvec_uv_bau_message); +# endif + +# ifdef CONFIG_X86_MCE_THRESHOLD +DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# endif + +# ifdef CONFIG_X86_MCE_AMD +DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# endif + +# ifdef CONFIG_X86_THERMAL_VECTOR +DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# endif + +# ifdef CONFIG_IRQ_WORK +DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# endif +#endif + +#ifdef CONFIG_HAVE_KVM +DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); +DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); +DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#endif + +#if IS_ENABLED(CONFIG_HYPERV) +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR, sysvec_hyperv_stimer0); +#endif + +#if IS_ENABLED(CONFIG_ACRN_GUEST) +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); +#endif + +#ifdef CONFIG_XEN_PVHVM +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); +#endif + +#undef X86_TRAP_OTHER + +#endif diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 8e5af119dc2d..de58391bdee0 
100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -88,11 +88,17 @@ static inline bool intel_mid_has_msic(void) return (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL); } +extern void intel_scu_devices_create(void); +extern void intel_scu_devices_destroy(void); + #else /* !CONFIG_X86_INTEL_MID */ #define intel_mid_identify_cpu() 0 #define intel_mid_has_msic() 0 +static inline void intel_scu_devices_create(void) { } +static inline void intel_scu_devices_destroy(void) { } + #endif /* !CONFIG_X86_INTEL_MID */ enum intel_mid_timer_options { @@ -115,9 +121,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern void intel_scu_devices_create(void); -extern void intel_scu_devices_destroy(void); - /* VRTC timer */ #define MRST_VRTC_MAP_SZ 1024 /* #define MRST_VRTC_PGOFFSET 0xc00 */ diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h deleted file mode 100644 index e6da1ce26256..000000000000 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_INTEL_PMC_IPC_H_ -#define _ASM_X86_INTEL_PMC_IPC_H_ - -/* Commands */ -#define PMC_IPC_PMIC_ACCESS 0xFF -#define PMC_IPC_PMIC_ACCESS_READ 0x0 -#define PMC_IPC_PMIC_ACCESS_WRITE 0x1 -#define PMC_IPC_USB_PWR_CTRL 0xF0 -#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF -#define PMC_IPC_PHY_CONFIG 0xEE -#define PMC_IPC_NORTHPEAK_CTRL 0xED -#define PMC_IPC_PM_DEBUG 0xEC -#define PMC_IPC_PMC_TELEMTRY 0xEB -#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA - -/* IPC return code */ -#define IPC_ERR_NONE 0 -#define IPC_ERR_CMD_NOT_SUPPORTED 1 -#define IPC_ERR_CMD_NOT_SERVICED 2 -#define IPC_ERR_UNABLE_TO_SERVICE 3 -#define IPC_ERR_CMD_INVALID 4 -#define IPC_ERR_CMD_FAILED 5 -#define IPC_ERR_EMSECURITY 6 -#define IPC_ERR_UNSIGNEDKERNEL 7 - -/* GCR reg offsets from gcr base*/ -#define PMC_GCR_PMC_CFG_REG 0x08 -#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78 -#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80 - -#if IS_ENABLED(CONFIG_INTEL_PMC_IPC) - -int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, - u32 *out, u32 outlen); -int intel_pmc_s0ix_counter_read(u64 *data); -int intel_pmc_gcr_read64(u32 offset, u64 *data); - -#else - -static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, - u32 *out, u32 outlen) -{ - return -EINVAL; -} - -static inline int intel_pmc_s0ix_counter_read(u64 *data) -{ - return -EINVAL; -} - -static inline int intel_pmc_gcr_read64(u32 offset, u64 *data) -{ - return -EINVAL; -} - -#endif /*CONFIG_INTEL_PMC_IPC*/ - -#endif diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 2a1442ba6e78..11d457af68c5 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -2,61 +2,69 @@ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ -#include <linux/notifier.h> - -#define IPCMSG_INDIRECT_READ 0x02 -#define IPCMSG_INDIRECT_WRITE 0x05 - -#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ - -#define IPCMSG_WARM_RESET 0xF0 -#define IPCMSG_COLD_RESET 0xF1 -#define IPCMSG_SOFT_RESET 0xF2 -#define IPCMSG_COLD_BOOT 0xF3 - -#define IPCMSG_VRTC 0xFA /* Set vRTC device */ - /* Command id associated with message IPCMSG_VRTC */ - #define IPC_CMD_VRTC_SETTIME 1 /* Set time */ - #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ - -/* Read single register */ -int intel_scu_ipc_ioread8(u16 addr, u8 *data); - -/* Read a vector */ -int intel_scu_ipc_readv(u16 
*addr, u8 *data, int len); - -/* Write single register */ -int intel_scu_ipc_iowrite8(u16 addr, u8 data); - -/* Write a vector */ -int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); - -/* Update single register based on the mask */ -int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); - -/* Issue commands to the SCU with or without data */ -int intel_scu_ipc_simple_command(int cmd, int sub); -int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, - u32 *out, int outlen); - -extern struct blocking_notifier_head intel_scu_notifier; - -static inline void intel_scu_notifier_add(struct notifier_block *nb) -{ - blocking_notifier_chain_register(&intel_scu_notifier, nb); -} - -static inline void intel_scu_notifier_remove(struct notifier_block *nb) -{ - blocking_notifier_chain_unregister(&intel_scu_notifier, nb); -} - -static inline int intel_scu_notifier_post(unsigned long v, void *p) +#include <linux/ioport.h> + +struct device; +struct intel_scu_ipc_dev; + +/** + * struct intel_scu_ipc_data - Data used to configure SCU IPC + * @mem: Base address of SCU IPC MMIO registers + * @irq: The IRQ number used for SCU (optional) + */ +struct intel_scu_ipc_data { + struct resource mem; + int irq; +}; + +struct intel_scu_ipc_dev * +__intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner); + +#define intel_scu_ipc_register(parent, scu_data) \ + __intel_scu_ipc_register(parent, scu_data, THIS_MODULE) + +void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu); + +struct intel_scu_ipc_dev * +__devm_intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner); + +#define devm_intel_scu_ipc_register(parent, scu_data) \ + __devm_intel_scu_ipc_register(parent, scu_data, THIS_MODULE) + +struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void); +void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu); +struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev); + +int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr, + u8 *data); +int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr, + u8 data); +int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr, + u8 *data, size_t len); +int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr, + u8 *data, size_t len); + +int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, u16 addr, + u8 data, u8 mask); + +int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, + int sub); +int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd, + int sub, const void *in, size_t inlen, + size_t size, void *out, size_t outlen); + +static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int cmd, + int sub, const void *in, size_t inlen, + void *out, size_t outlen) { - return blocking_notifier_call_chain(&intel_scu_notifier, v, p); + return intel_scu_ipc_dev_command_with_size(scu, cmd, sub, in, inlen, + inlen, out, outlen); } -#define SCU_AVAILABLE 1 -#define SCU_DOWN 2 +#include <asm/intel_scu_ipc_legacy.h> #endif diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h new file mode 100644 index 000000000000..4cf13fecb673 --- /dev/null +++ b/arch/x86/include/asm/intel_scu_ipc_legacy.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INTEL_SCU_IPC_LEGACY_H_ +#define _ASM_X86_INTEL_SCU_IPC_LEGACY_H_ + +#include <linux/notifier.h> + +#define 
IPCMSG_INDIRECT_READ 0x02 +#define IPCMSG_INDIRECT_WRITE 0x05 + +#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ + +#define IPCMSG_WARM_RESET 0xF0 +#define IPCMSG_COLD_RESET 0xF1 +#define IPCMSG_SOFT_RESET 0xF2 +#define IPCMSG_COLD_BOOT 0xF3 + +#define IPCMSG_VRTC 0xFA /* Set vRTC device */ +/* Command id associated with message IPCMSG_VRTC */ +#define IPC_CMD_VRTC_SETTIME 1 /* Set time */ +#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ + +/* Don't call these in new code - they will be removed eventually */ + +/* Read single register */ +static inline int intel_scu_ipc_ioread8(u16 addr, u8 *data) +{ + return intel_scu_ipc_dev_ioread8(NULL, addr, data); +} + +/* Read a vector */ +static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len) +{ + return intel_scu_ipc_dev_readv(NULL, addr, data, len); +} + +/* Write single register */ +static inline int intel_scu_ipc_iowrite8(u16 addr, u8 data) +{ + return intel_scu_ipc_dev_iowrite8(NULL, addr, data); +} + +/* Write a vector */ +static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len) +{ + return intel_scu_ipc_dev_writev(NULL, addr, data, len); +} + +/* Update single register based on the mask */ +static inline int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask) +{ + return intel_scu_ipc_dev_update(NULL, addr, data, mask); +} + +/* Issue commands to the SCU with or without data */ +static inline int intel_scu_ipc_simple_command(int cmd, int sub) +{ + return intel_scu_ipc_dev_simple_command(NULL, cmd, sub); +} + +static inline int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, + u32 *out, int outlen) +{ + /* New API takes both inlen and outlen as bytes so convert here */ + size_t inbytes = inlen * sizeof(u32); + size_t outbytes = outlen * sizeof(u32); + + return intel_scu_ipc_dev_command_with_size(NULL, cmd, sub, in, inbytes, + inlen, out, outbytes); +} + +extern struct blocking_notifier_head intel_scu_notifier; + +static inline void intel_scu_notifier_add(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&intel_scu_notifier, nb); +} + +static inline void intel_scu_notifier_remove(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&intel_scu_notifier, nb); +} + +static inline int intel_scu_notifier_post(unsigned long v, void *p) +{ + return blocking_notifier_call_chain(&intel_scu_notifier, v, p); +} + +#define SCU_AVAILABLE 1 +#define SCU_DOWN 2 + +#endif diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 2f77e31a1283..8046e70dfd7c 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -10,6 +10,8 @@ #define TELEM_MAX_EVENTS_SRAM 28 #define TELEM_MAX_OS_ALLOCATED_EVENTS 20 +#include <asm/intel_scu_ipc.h> + enum telemetry_unit { TELEM_PSS = 0, TELEM_IOSS, @@ -51,6 +53,8 @@ struct telemetry_plt_config { struct telemetry_unit_config ioss_config; struct mutex telem_trace_lock; struct mutex telem_lock; + struct intel_pmc_dev *pmc; + struct intel_scu_ipc_dev *scu; bool telem_in_use; }; @@ -92,7 +96,7 @@ int telemetry_set_pltdata(const struct telemetry_core_ops *ops, int telemetry_clear_pltdata(void); -int telemetry_pltconfig_valid(void); +struct telemetry_plt_config *telemetry_get_pltdata(void); int telemetry_get_evtname(enum telemetry_unit telem_unit, const char **name, int len); diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h index 989cfa86de85..734482afbf81 100644 --- a/arch/x86/include/asm/invpcid.h +++ b/arch/x86/include/asm/invpcid.h @@ -12,12 
+12,9 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * stale TLB entries and, especially if we're flushing global * mappings, we don't want the compiler to reorder any subsequent * memory accesses before the TLB flush. - * - * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and - * invpcid (%rcx), %rax in long mode. */ - asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (&desc) : "memory"); + asm volatile("invpcid %[desc], %[type]" + :: [desc] "m" (desc), [type] "r" (type) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index 07344d82e88e..ac1a99ffbd8d 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -17,7 +17,7 @@ struct task_struct; #ifdef CONFIG_X86_IOPL_IOPERM void io_bitmap_share(struct task_struct *tsk); -void io_bitmap_exit(void); +void io_bitmap_exit(struct task_struct *tsk); void native_tss_update_io_bitmap(void); @@ -29,7 +29,7 @@ void native_tss_update_io_bitmap(void); #else static inline void io_bitmap_share(struct task_struct *tsk) { } -static inline void io_bitmap_exit(void) { } +static inline void io_bitmap_exit(struct task_struct *tsk) { } static inline void tss_update_io_bitmap(void) { } #endif diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 2a7b3211ee7a..bacf68c4d70e 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -10,7 +10,6 @@ #include <linux/mm.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> void __iomem * diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 72fba0eeeb30..528c8a71fe7f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -11,6 +11,13 @@ #include <asm/apicdef.h> #include <asm/irq_vectors.h> +/* + * The irq entry code is in the noinstr section and the start/end of + * __irqentry_text is emitted via labels. Make the build fail if + * something moves a C function into the __irq_entry section. + */ +#define __irq_entry __invalid_section + static inline int irq_canonicalize(int irq) { return ((irq == 2) ? 
9 : irq); @@ -26,17 +33,14 @@ extern void fixup_irqs(void); #ifdef CONFIG_HAVE_KVM extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); -extern __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs); -extern __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs); -extern __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs); #endif extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs); +extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); -extern __visible void do_IRQ(struct pt_regs *regs); +extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector); extern void init_ISA_irqs(void); @@ -46,7 +50,6 @@ extern void __init init_IRQ(void); void arch_trigger_cpumask_backtrace(const struct cpumask *mask, bool exclude_self); -extern __visible void smp_x86_platform_ipi(struct pt_regs *regs); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h deleted file mode 100644 index 187ce59aea28..000000000000 --- a/arch/x86/include/asm/irq_regs.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Per-cpu current frame pointer - the location of the last exception frame on - * the stack, stored in the per-cpu area. - * - * Jeremy Fitzhardinge <jeremy@goop.org> - */ -#ifndef _ASM_X86_IRQ_REGS_H -#define _ASM_X86_IRQ_REGS_H - -#include <asm/percpu.h> - -#define ARCH_HAS_OWN_IRQ_REGS - -DECLARE_PER_CPU(struct pt_regs *, irq_regs); - -static inline struct pt_regs *get_irq_regs(void) -{ - return __this_cpu_read(irq_regs); -} - -static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) -{ - struct pt_regs *old_regs; - - old_regs = get_irq_regs(); - __this_cpu_write(irq_regs, new_regs); - - return old_regs; -} - -#endif /* _ASM_X86_IRQ_REGS_32_H */ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h new file mode 100644 index 000000000000..4ae66f097101 --- /dev/null +++ b/arch/x86/include/asm/irq_stack.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_IRQ_STACK_H +#define _ASM_X86_IRQ_STACK_H + +#include <linux/ptrace.h> + +#include <asm/processor.h> + +#ifdef CONFIG_X86_64 +static __always_inline bool irqstack_active(void) +{ + return __this_cpu_read(irq_count) != -1; +} + +void asm_call_on_stack(void *sp, void *func, void *arg); + +static __always_inline void __run_on_irqstack(void *func, void *arg) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + asm_call_on_stack(tos - 8, func, arg); + __this_cpu_sub(irq_count, 1); +} + +#else /* CONFIG_X86_64 */ +static inline bool irqstack_active(void) { return false; } +static inline void __run_on_irqstack(void *func, void *arg) { } +#endif /* !CONFIG_X86_64 */ + +static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return false; + if (!regs) + return !irqstack_active(); + return !user_mode(regs) && !irqstack_active(); +} + +static __always_inline void run_on_irqstack_cond(void *func, void *arg, + struct pt_regs *regs) +{ + void (*__func)(void *arg) = func; + + lockdep_assert_irqs_disabled(); + + if (irq_needs_irq_stack(regs)) + __run_on_irqstack(__func, arg); + else + __func(arg); +} + +#endif diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 80b35e3adf03..800ffce0db29 100644 
--- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -10,7 +10,6 @@ static inline bool arch_irq_work_has_interrupt(void) return boot_cpu_has(X86_FEATURE_APIC); } extern void arch_irq_work_raise(void); -extern __visible void smp_irq_work_interrupt(struct pt_regs *regs); #else static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 8a0e56e1dcc9..02a0cf547d7b 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -17,7 +17,7 @@ /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ extern inline unsigned long native_save_fl(void); -extern inline unsigned long native_save_fl(void) +extern __always_inline unsigned long native_save_fl(void) { unsigned long flags; @@ -44,12 +44,12 @@ extern inline void native_restore_fl(unsigned long flags) :"memory", "cc"); } -static inline void native_irq_disable(void) +static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); } -static inline void native_irq_enable(void) +static __always_inline void native_irq_enable(void) { asm volatile("sti": : :"memory"); } @@ -74,22 +74,22 @@ static inline __cpuidle void native_halt(void) #ifndef __ASSEMBLY__ #include <linux/types.h> -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return native_save_fl(); } -static inline notrace void arch_local_irq_restore(unsigned long flags) +static __always_inline void arch_local_irq_restore(unsigned long flags) { native_restore_fl(flags); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void arch_local_irq_enable(void) { native_irq_enable(); } @@ -115,7 +115,7 @@ static inline __cpuidle void halt(void) /* * For spinlocks, etc: */ -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); @@ -159,12 +159,12 @@ static inline notrace unsigned long arch_local_irq_save(void) #endif /* CONFIG_PARAVIRT_XXL */ #ifndef __ASSEMBLY__ -static inline int arch_irqs_disabled_flags(unsigned long flags) +static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } -static inline int arch_irqs_disabled(void) +static __always_inline int arch_irqs_disabled(void) { unsigned long flags = arch_local_save_flags(); @@ -172,38 +172,4 @@ static inline int arch_irqs_disabled(void) } #endif /* !__ASSEMBLY__ */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; -#else -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# ifdef CONFIG_X86_64 -# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -# define LOCKDEP_SYS_EXIT_IRQ \ - TRACE_IRQS_ON; \ - sti; \ - call lockdep_sys_exit_thunk; \ - cli; \ - TRACE_IRQS_OFF; -# else -# define LOCKDEP_SYS_EXIT \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call lockdep_sys_exit; \ - popl %edx; \ - popl %ecx; \ - popl %eax; -# define LOCKDEP_SYS_EXIT_IRQ -# endif -#else -# define LOCKDEP_SYS_EXIT -# define LOCKDEP_SYS_EXIT_IRQ -#endif -#endif /* __ASSEMBLY__ */ - #endif diff --git 
a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index db7ba2feb947..0648190467ba 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -6,8 +6,10 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY void kernel_randomize_memory(void); +void init_trampoline_kaslr(void); #else static inline void kernel_randomize_memory(void) { } +static inline void init_trampoline_kaslr(void) {} #endif /* CONFIG_RANDOMIZE_MEMORY */ #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 42a2d0d3984a..1da5858501ca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -83,6 +83,10 @@ #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -107,15 +111,8 @@ #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ -enum { - PT_PAGE_TABLE_LEVEL = 1, - PT_DIRECTORY_LEVEL = 2, - PT_PDPE_LEVEL = 3, - /* set max level to the biggest one */ - PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL, -}; -#define KVM_NR_PAGE_SIZES (PT_MAX_HUGEPAGE_LEVEL - \ - PT_PAGE_TABLE_LEVEL + 1) +#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G +#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1) #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) @@ -124,7 +121,7 @@ enum { static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) { - /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. 
*/ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); } @@ -164,9 +161,13 @@ enum kvm_reg { NR_VCPU_REGS, VCPU_EXREG_PDPTR = NR_VCPU_REGS, + VCPU_EXREG_CR0, VCPU_EXREG_CR3, + VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, + VCPU_EXREG_EXIT_INFO_1, + VCPU_EXREG_EXIT_INFO_2, }; enum { @@ -182,8 +183,10 @@ enum { enum exit_fastpath_completion { EXIT_FASTPATH_NONE, - EXIT_FASTPATH_SKIP_EMUL_INS, + EXIT_FASTPATH_REENTER_GUEST, + EXIT_FASTPATH_EXIT_HANDLED, }; +typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; @@ -372,12 +375,12 @@ struct rsvd_bits_validate { }; struct kvm_mmu_root_info { - gpa_t cr3; + gpa_t pgd; hpa_t hpa; }; #define KVM_MMU_ROOT_INFO_INVALID \ - ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE }) #define KVM_MMU_NUM_PREV_ROOTS 3 @@ -403,7 +406,7 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - gpa_t root_cr3; + gpa_t root_pgd; union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; @@ -578,6 +581,7 @@ struct kvm_vcpu_arch { unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr8; + u32 host_pkru; u32 pkru; u32 hflags; u64 efer; @@ -597,6 +601,7 @@ struct kvm_vcpu_arch { u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; + u64 perf_capabilities; /* * Paging state of the vcpu @@ -649,7 +654,6 @@ struct kvm_vcpu_arch { u64 xcr0; u64 guest_supported_xcr0; - u32 guest_xstate_size; struct kvm_pio_request pio; void *pio_data; @@ -679,6 +683,7 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; + int tdp_level; /* emulate context */ @@ -702,6 +707,7 @@ struct kvm_vcpu_arch { struct gfn_to_pfn_cache cache; } st; + u64 l1_tsc_offset; u64 tsc_offset; u64 last_guest_tsc; u64 last_host_tsc; @@ -761,14 +767,17 @@ struct kvm_vcpu_arch { struct { bool halted; - gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; + gfn_t gfns[ASYNC_PF_PER_VCPU]; struct gfn_to_hva_cache data; - u64 msr_val; + u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */ + u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */ + u16 vec; u32 id; bool send_user_only; - u32 host_apf_reason; + u32 host_apf_flags; unsigned long nested_apf_token; bool delivery_as_pf_vmexit; + bool pageready_pending; } apf; /* OSVW MSRs (AMD only) */ @@ -854,6 +863,18 @@ struct kvm_apic_map { struct kvm_lapic *phys_map[]; }; +/* Hyper-V synthetic debugger (SynDbg)*/ +struct kvm_hv_syndbg { + struct { + u64 control; + u64 status; + u64 send_page; + u64 recv_page; + u64 pending_page; + } control; + u64 options; +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -865,7 +886,7 @@ struct kvm_hv { u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; - HV_REFERENCE_TSC_PAGE tsc_ref; + struct ms_hyperv_tsc_page tsc_ref; struct idr conn_to_evt; @@ -877,6 +898,7 @@ struct kvm_hv { atomic_t num_mismatched_vp_indexes; struct hv_partition_assist_pg *hv_pa_pg; + struct kvm_hv_syndbg hv_syndbg; }; enum kvm_irqchip_mode { @@ -1027,6 +1049,8 @@ struct kvm_vcpu_stat { u64 irq_injections; u64 nmi_injections; u64 req_event; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; }; struct x86_instruction_info; @@ -1058,7 +1082,7 @@ struct kvm_x86_ops { void (*hardware_disable)(void); void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); - bool (*has_emulated_msr)(int index); + 
bool (*has_emulated_msr)(u32 index); void (*cpuid_update)(struct kvm_vcpu *vcpu); unsigned int vm_size; @@ -1084,8 +1108,6 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); @@ -1093,15 +1115,14 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); - u64 (*get_dr6)(struct kvm_vcpu *vcpu); - void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + void (*tlb_flush_all)(struct kvm_vcpu *vcpu); + void (*tlb_flush_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); @@ -1114,7 +1135,13 @@ struct kvm_x86_ops { */ void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); - void (*run)(struct kvm_vcpu *vcpu); + /* + * Flush any TLB entries created by the guest. Like tlb_flush_gva(), + * does not need to flush GPA->HPA mappings. + */ + void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + + enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1127,8 +1154,8 @@ struct kvm_x86_ops { void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); - int (*interrupt_allowed)(struct kvm_vcpu *vcpu); - int (*nmi_allowed)(struct kvm_vcpu *vcpu); + int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); @@ -1142,7 +1169,7 @@ struct kvm_x86_ops { bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); - void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); + void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@ -1154,7 +1181,6 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); @@ -1164,10 +1190,8 @@ struct kvm_x86_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception); - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, - enum 
exit_fastpath_completion *exit_fastpath); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1200,6 +1224,7 @@ struct kvm_x86_ops { /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + const struct kvm_x86_nested_ops *nested_ops; /* * Architecture specific hooks for vCPU blocking due to @@ -1227,18 +1252,10 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); - int (*get_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - unsigned user_data_size); - int (*set_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - struct kvm_nested_state *kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); - - int (*smi_allowed)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); - int (*enable_smi_window)(struct kvm_vcpu *vcpu); + void (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); @@ -1246,14 +1263,28 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version); - uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); + + void (*migrate_timers)(struct kvm_vcpu *vcpu); +}; + +struct kvm_x86_nested_ops { + int (*check_events)(struct kvm_vcpu *vcpu); + bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); + int (*get_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); + bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + + int (*enable_evmcs)(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); + uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { @@ -1280,8 +1311,7 @@ extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return __vmalloc(kvm_x86_ops.vm_size, - GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm); @@ -1449,9 +1479,12 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); @@ -1479,6 +1512,8 @@ void 
kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); +void kvm_update_dr7(struct kvm_vcpu *vcpu); + int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); @@ -1509,8 +1544,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, + bool skip_mmu_sync); void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); @@ -1574,8 +1612,6 @@ enum { }; #define HF_GIF_MASK (1 << 0) -#define HF_HIF_MASK (1 << 1) -#define HF_VINTR_MASK (1 << 2) #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ @@ -1641,7 +1677,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu); +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); @@ -1663,8 +1700,8 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ - return (irq->delivery_mode == dest_Fixed || - irq->delivery_mode == dest_LowestPrio); + return (irq->delivery_mode == APIC_DM_FIXED || + irq->delivery_mode == APIC_DM_LOWEST); } static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 9b4df6eaa11a..49d3a9edb06f 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,11 +88,21 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_hints(void); -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); +void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wake(u32 token); -u32 kvm_read_and_reset_pf_reason(void); -extern void kvm_disable_steal_time(void); -void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); +u32 kvm_read_and_reset_apf_flags(void); +void kvm_disable_steal_time(void); +bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); + +DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + +static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + if (static_branch_unlikely(&kvm_async_pf_enabled)) + return __kvm_handle_async_pf(regs, token); + else + return false; +} #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init kvm_spinlock_init(void); @@ -103,7 +113,7 @@ static inline void kvm_spinlock_init(void) #endif /* CONFIG_PARAVIRT_SPINLOCKS */ #else /* CONFIG_KVM_GUEST */ 
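/*
 * The kvm_handle_async_pf() wrapper above is an instance of the kernel's
 * static-key pattern: while the key is off, static_branch_unlikely()
 * compiles to a patched NOP, so guests that never negotiate async page
 * faults pay nothing on the #PF path. A minimal sketch of the pattern,
 * with hypothetical names (feature_enabled, handle_slow_path) that are
 * not kernel APIs:
 *
 *	#include <linux/jump_label.h>
 *
 *	DEFINE_STATIC_KEY_FALSE(feature_enabled);
 *
 *	extern bool handle_slow_path(void);	// hypothetical slow path
 *
 *	static __always_inline bool maybe_handle(void)
 *	{
 *		// Stays a NOP until static_branch_enable() patches in
 *		// the jump to the out-of-line slow path.
 *		if (static_branch_unlikely(&feature_enabled))
 *			return handle_slow_path();
 *		return false;
 *	}
 *
 *	// Flipped exactly once, e.g. during feature negotiation:
 *	//	static_branch_enable(&feature_enabled);
 */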
-#define kvm_async_pf_task_wait(T, I) do {} while(0) +#define kvm_async_pf_task_wait_schedule(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) @@ -121,7 +131,7 @@ static inline unsigned int kvm_arch_para_hints(void) return 0; } -static inline u32 kvm_read_and_reset_pf_reason(void) +static inline u32 kvm_read_and_reset_apf_flags(void) { return 0; } @@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void) { return; } + +static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + return false; +} #endif #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c598aaab071b..cf503824529c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -251,7 +251,7 @@ extern void mce_disable_bank(int bank); /* * Exception handler */ -void do_machine_check(struct pt_regs *, long); +void do_machine_check(struct pt_regs *pt_regs); /* * Threshold handler diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h index 9c2447b3555d..9ca760e430b9 100644 --- a/arch/x86/include/asm/memtype.h +++ b/arch/x86/include/asm/memtype.h @@ -24,4 +24,7 @@ extern void memtype_free_io(resource_size_t start, resource_size_t end); extern bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn); +bool x86_has_pat_wp(void); +enum page_cache_mode pgprot2cachemode(pgprot_t pgprot); + #endif /* _ASM_X86_MEMTYPE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 6685e1218959..7063b5a43220 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -41,7 +41,7 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define PATCH_MAX_SIZE PAGE_SIZE +#define PATCH_MAX_SIZE (3 * PAGE_SIZE) #ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index bdeae9291e5c..0a301ad0b02f 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -45,7 +45,7 @@ typedef struct { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* * One bit per protection key says whether userspace can - * use it or not. protected by mmap_sem. + * use it or not. protected by mmap_lock. 
*/ u16 pkey_allocation_map; s16 execute_only_pkey; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4e55370e48e8..47562147e70b 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -24,21 +24,9 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, #endif /* !CONFIG_PARAVIRT_XXL */ #ifdef CONFIG_PERF_EVENTS - DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); - -static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) -{ - if (static_branch_unlikely(&rdpmc_always_available_key) || - (!static_branch_unlikely(&rdpmc_never_available_key) && - atomic_read(&mm->context.perf_rdpmc_allowed))) - cr4_set_bits_irqsoff(X86_CR4_PCE); - else - cr4_clear_bits_irqsoff(X86_CR4_PCE); -} -#else -static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {} +void cr4_update_pce(void *ignored); #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL @@ -225,78 +213,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } -/* - * This can be used from process context to figure out what the value of - * CR3 is without needing to do a (slow) __read_cr3(). - * - * It's intended to be used for code like KVM that sneakily changes CR3 - * and needs to restore it. It needs to be used very carefully. - */ -static inline unsigned long __get_current_cr3_fast(void) -{ - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, - this_cpu_read(cpu_tlbstate.loaded_mm_asid)); - - /* For now, be very restrictive about when this can be called. */ - VM_WARN_ON(in_nmi() || preemptible()); - - VM_BUG_ON(cr3 != __read_cr3()); - return cr3; -} - -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. 
- */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} +unsigned long __get_current_cr3_fast(void); #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 73d8dd14dda2..2d4515e8b7df 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -14,43 +14,4 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #endif /* CONFIG_NUMA */ -#ifdef CONFIG_DISCONTIGMEM - -/* - * generic node memory support, the following assumptions apply: - * - * 1) memory comes in 64Mb contiguous chunks which are either present or not - * 2) we will not have more than 64Gb in total - * - * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb / 4096bytes/page = 16777216 pages - */ -#define MAX_NR_PAGES 16777216 -#define MAX_SECTIONS 1024 -#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS) - -extern s8 physnode_map[]; - -static inline int pfn_to_nid(unsigned long pfn) -{ -#ifdef CONFIG_NUMA - return((int) physnode_map[(pfn) / PAGES_PER_SECTION]); -#else - return 0; -#endif -} - -static inline int pfn_valid(int pfn) -{ - int nid = pfn_to_nid(pfn); - - if (nid >= 0) - return (pfn < node_end_pfn(nid)); - return 0; -} - -#define early_pfn_valid(pfn) pfn_valid((pfn)) - -#endif /* CONFIG_DISCONTIGMEM */ - #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index c215d2762488..e988bac0a4a1 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -13,64 +13,4 @@ struct mod_arch_specific { #endif }; -#ifdef CONFIG_X86_64 -/* X86_64 does not define MODULE_PROC_FAMILY */ -#elif defined CONFIG_M486SX -#define MODULE_PROC_FAMILY "486SX " -#elif defined CONFIG_M486 -#define MODULE_PROC_FAMILY "486 " -#elif defined CONFIG_M586 -#define MODULE_PROC_FAMILY "586 " -#elif defined CONFIG_M586TSC -#define MODULE_PROC_FAMILY "586TSC " -#elif defined CONFIG_M586MMX -#define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " -#elif defined CONFIG_MATOM -#define MODULE_PROC_FAMILY "ATOM " -#elif defined CONFIG_M686 -#define MODULE_PROC_FAMILY "686 " -#elif defined CONFIG_MPENTIUMII -#define MODULE_PROC_FAMILY "PENTIUMII " -#elif defined CONFIG_MPENTIUMIII -#define MODULE_PROC_FAMILY "PENTIUMIII " -#elif defined CONFIG_MPENTIUMM -#define MODULE_PROC_FAMILY "PENTIUMM " -#elif defined CONFIG_MPENTIUM4 -#define MODULE_PROC_FAMILY "PENTIUM4 " -#elif defined CONFIG_MK6 -#define MODULE_PROC_FAMILY "K6 " -#elif defined CONFIG_MK7 -#define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_MELAN -#define MODULE_PROC_FAMILY "ELAN " -#elif defined CONFIG_MCRUSOE -#define MODULE_PROC_FAMILY "CRUSOE " -#elif defined CONFIG_MEFFICEON -#define MODULE_PROC_FAMILY "EFFICEON " -#elif defined CONFIG_MWINCHIPC6 -#define MODULE_PROC_FAMILY "WINCHIPC6 " -#elif defined CONFIG_MWINCHIP3D -#define MODULE_PROC_FAMILY "WINCHIP3D " -#elif defined CONFIG_MCYRIXIII -#define MODULE_PROC_FAMILY "CYRIXIII " -#elif defined CONFIG_MVIAC3_2 -#define MODULE_PROC_FAMILY "VIAC3-2 " -#elif defined CONFIG_MVIAC7 
-#define MODULE_PROC_FAMILY "VIAC7 " -#elif defined CONFIG_MGEODEGX1 -#define MODULE_PROC_FAMILY "GEODEGX1 " -#elif defined CONFIG_MGEODE_LX -#define MODULE_PROC_FAMILY "GEODE " -#else -#error unknown processor family -#endif - -#ifdef CONFIG_X86_32 -# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY -#endif - #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 1c42ecbe75cb..60b944dd2df1 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -35,6 +35,8 @@ typedef int (*hyperv_fill_flush_list_func)( rdmsrl(HV_X64_MSR_SINT0 + int_num, val) #define hv_set_synint_state(int_num, val) \ wrmsrl(HV_X64_MSR_SINT0 + int_num, val) +#define hv_recommend_using_aeoi() \ + (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)) #define hv_get_crash_ctl(val) \ rdmsrl(HV_X64_MSR_CRASH_CTL, val) @@ -52,20 +54,8 @@ typedef int (*hyperv_fill_flush_list_func)( vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); #define hv_get_raw_timer() rdtsc_ordered() -void hyperv_callback_vector(void); -void hyperv_reenlightenment_vector(void); -#ifdef CONFIG_TRACING -#define trace_hyperv_callback_vector hyperv_callback_vector -#endif void hyperv_vector_handler(struct pt_regs *regs); -/* - * Routines for stimer0 Direct Mode handling. - * On x86/x64, there are no percpu actions to take. - */ -void hv_stimer0_vector_handler(struct pt_regs *regs); -void hv_stimer0_callback_vector(void); - static inline void hv_enable_stimer0_percpu_irq(int irq) {} static inline void hv_disable_stimer0_percpu_irq(int irq) {} @@ -224,7 +214,6 @@ void hyperv_setup_mmu_ops(void); void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); void hv_free_hyperv_page(unsigned long addr); -void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 12c9684d59ba..e8370e64a155 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -128,6 +128,10 @@ #define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ #define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ +/* SRBDS support */ +#define MSR_IA32_MCU_OPT_CTRL 0x00000123 +#define RNGDS_MITG_DIS BIT(0) + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 @@ -301,6 +305,9 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b +#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 + /* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 #define MSR_CONFIG_TDP_LEVEL_1 0x00000649 diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index b809f117f3f4..73d997aa2966 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -20,8 +20,10 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 #define MWAITX_ECX_TIMER_ENABLE BIT(1) -#define MWAITX_MAX_LOOPS ((u32)-1) +#define MWAITX_MAX_WAIT_CYCLES UINT_MAX #define MWAITX_DISABLE_CSTATES 0xf0 +#define TPAUSE_C01_STATE 1 +#define TPAUSE_C02_STATE 0 u32 get_umwait_control_msr(void); @@ -122,4 +124,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) current_clr_polling(); } +/* + * Caller can specify whether to enter C0.1 (low latency, less + * power saving) or C0.2 state (saves more power, but longer wakeup + * latency). 
This may be overridden by the IA32_UMWAIT_CONTROL MSR + * which can force requests for C0.2 to be downgraded to C0.1. + */ +static inline void __tpause(u32 ecx, u32 edx, u32 eax) +{ + /* "tpause %ecx, %edx, %eax;" */ + #ifdef CONFIG_AS_TPAUSE + asm volatile("tpause %%ecx\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #else + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #endif +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 07e95dcb40ad..e7752b4038ff 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -4,20 +4,13 @@ #define _ASM_X86_NOSPEC_BRANCH_H_ #include <linux/static_key.h> +#include <linux/frame.h> #include <asm/alternative.h> #include <asm/alternative-asm.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> - -/* - * This should be used immediately before a retpoline alternative. It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - ANNOTATE_IGNORE_ALTERNATIVE +#include <asm/unwind_hints.h> /* * Fill the CPU return stack buffer. @@ -46,21 +39,25 @@ #define __FILL_RETURN_BUFFER(reg, nr, sp) \ mov $(nr/2), reg; \ 771: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 772f; \ 773: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 773b; \ 772: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 774f; \ 775: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 775b; \ 774: \ + add $(BITS_PER_LONG/8) * 2, sp; \ dec reg; \ - jnz 771b; \ - add $(BITS_PER_LONG/8) * nr, sp; + jnz 771b; #ifdef __ASSEMBLY__ @@ -77,57 +74,27 @@ .endm /* - * These are the bare retpoline primitives for indirect jmp and call. - * Do not use these directly; they only exist to make the ALTERNATIVE - * invocation below less ugly. - */ -.macro RETPOLINE_JMP reg:req - call .Ldo_rop_\@ -.Lspec_trap_\@: - pause - lfence - jmp .Lspec_trap_\@ -.Ldo_rop_\@: - mov \reg, (%_ASM_SP) - ret -.endm - -/* - * This is a wrapper around RETPOLINE_JMP so the called function in reg - * returns to the instruction after the macro. - */ -.macro RETPOLINE_CALL reg:req - jmp .Ldo_call_\@ -.Ldo_retpoline_jmp_\@: - RETPOLINE_JMP \reg -.Ldo_call_\@: - call .Ldo_retpoline_jmp_\@ -.endm - -/* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. 
*/ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ - __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ + __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD #else - jmp *\reg + jmp *%\reg #endif .endm .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ - __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ + __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD #else - call *\reg + call *%\reg #endif .endm @@ -137,10 +104,8 @@ */ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ - \ftr + ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr + __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) .Lskip_rsb_\@: #endif .endm @@ -161,16 +126,16 @@ * which is ensured when CONFIG_RETPOLINE is defined. */ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ + "call __x86_retpoline_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ X86_FEATURE_RETPOLINE_AMD) + # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ @@ -180,7 +145,6 @@ * here, anyway. */ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ @@ -237,27 +201,6 @@ enum ssb_mitigation { extern char __indirect_thunk_start[]; extern char __indirect_thunk_end[]; -/* - * On VMEXIT we must ensure that no RSB predictions learned in the guest - * can be followed in the host, by overwriting the RSB completely. Both - * retpoline and IBRS mitigations for Spectre v2 need this; only on future - * CPUs with IBRS_ALL *might* it be avoided. - */ -static inline void vmexit_fill_RSB(void) -{ -#ifdef CONFIG_RETPOLINE - unsigned long loops; - - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE("jmp 910f", - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), - X86_FEATURE_RETPOLINE) - "910:" - : "=r" (loops), ASM_CALL_CONSTRAINT - : : "memory" ); -#endif -} - static __always_inline void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) { @@ -319,7 +262,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); * combination with microcode which triggers a CPU buffer flush when the * instruction is executed. 
*/ -static inline void mds_clear_cpu_buffers(void) +static __always_inline void mds_clear_cpu_buffers(void) { static const u16 ds = __KERNEL_DS; @@ -340,7 +283,7 @@ static inline void mds_clear_cpu_buffers(void) * * Clear CPU buffers if the corresponding static key is enabled */ -static inline void mds_user_clear_cpu_buffers(void) +static __always_inline void mds_user_clear_cpu_buffers(void) { if (static_branch_likely(&mds_user_clear)) mds_clear_cpu_buffers(); diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 6e060907c163..d25534940bde 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -58,8 +58,7 @@ #define ORC_TYPE_CALL 0 #define ORC_TYPE_REGS 1 #define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_SAVE 3 -#define UNWIND_HINT_TYPE_RESTORE 4 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 694d8daf4983..5ca5d297df75 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -47,7 +47,13 @@ static inline void slow_down_io(void) #endif } -static inline void __flush_tlb(void) +void native_flush_tlb_local(void); +void native_flush_tlb_global(void); +void native_flush_tlb_one_user(unsigned long addr); +void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + +static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } @@ -62,8 +68,8 @@ static inline void __flush_tlb_one_user(unsigned long addr) PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } -static inline void flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +static inline void __flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info); } diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 6deb6cd236e3..7f6ccff0ba72 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -20,6 +20,8 @@ typedef union { #define SHARED_KERNEL_PMD 0 +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED + /* * traditional i386 two-level paging structure: */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 5afb5e0fe903..e896ebef8c24 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -39,23 +39,23 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with * a "*pmdp" dereference done by GCC. Problem is, in certain places * where pte_offset_map_lock() is called, concurrent page faults are - * allowed, if the mmap_sem is hold for reading. An example is mincore + * allowed, if the mmap_lock is hold for reading. An example is mincore * vs page faults vs MADV_DONTNEED. On the page fault side * pmd_populate() rightfully does a set_64bit(), but if we're reading the * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen * because GCC will not read the 64-bit value of the pmd atomically. 
* * To fix this all places running pte_offset_map_lock() while holding the - * mmap_sem in read mode, shall read the pmdp pointer using this + * mmap_lock in read mode, shall read the pmdp pointer using this * function to know if the pmd is null or not, and in turn to know if * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd * operations. * - * Without THP if the mmap_sem is held for reading, the pmd can only + * Without THP if the mmap_lock is held for reading, the pmd can only * transition from null to not null while pmd_read_atomic() runs. So * we can always return atomic pmd values with this function. * - * With THP if the mmap_sem is held for reading, the pmd can become + * With THP if the mmap_lock is held for reading, the pmd can become * trans_huge or none or point to a pte (and in turn become "stable") * at any time under pmd_read_atomic(). We could read it truly * atomically here with an atomic64_read() for the THP enabled case (and diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 33845d36897c..80fbb4a9ed87 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -27,6 +27,8 @@ typedef union { #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif +#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 0 : PGTBL_PMD_MODIFIED) + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4d02e64af1b3..76aa21e8128d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -257,6 +257,7 @@ static inline int pmd_large(pmd_t pte) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; @@ -624,7 +625,7 @@ static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) return __pud(pfn | check_pgprot(pgprot)); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return pfn_pmd(pmd_pfn(pmd), __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); @@ -801,7 +802,7 @@ static inline int pmd_present(pmd_t pmd) #ifdef CONFIG_NUMA_BALANCING /* * These work without NUMA balancing but the kernel does not care. See the - * comment in include/asm-generic/pgtable.h + * comment in include/linux/pgtable.h */ static inline int pte_protnone(pte_t pte) { @@ -836,17 +837,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* - * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] - * - * this macro returns the index of the entry in the pmd page which would - * control the given virtual address - */ -static inline unsigned long pmd_index(unsigned long address) -{ - return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); -} - -/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. 
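The new NOTE above pmd_trans_huge() warns that a devmap pmd also carries _PAGE_PSE, so an "is this a huge page?" predicate must either mask _PAGE_DEVMAP out or use pmd_large(). A runnable model of that distinction, with the x86-64 bit positions assumed (_PAGE_PSE is bit 7, _PAGE_DEVMAP bit 58); the sk_* names are illustrative:

	#include <stdbool.h>
	#include <stdint.h>

	#define SK_PAGE_PSE	(1ULL << 7)	/* _PAGE_PSE */
	#define SK_PAGE_DEVMAP	(1ULL << 58)	/* _PAGE_DEVMAP (x86-64) */

	/* THP only: a devmap huge pmd also has PSE set but must not count. */
	static bool sk_pmd_trans_huge(uint64_t pmd)
	{
		return (pmd & (SK_PAGE_PSE | SK_PAGE_DEVMAP)) == SK_PAGE_PSE;
	}

	/* Any huge mapping, THP or devmap: what the NOTE calls pmd_large(). */
	static bool sk_pmd_large(uint64_t pmd)
	{
		return pmd & SK_PAGE_PSE;
	}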
* @@ -855,25 +845,6 @@ static inline unsigned long pmd_index(unsigned long address) */ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -/* - * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] - * - * this function returns the index of the entry in the pte page which would - * control the given virtual address - * - * Also define macro so we can test if pte_index is defined for arch. - */ -#define pte_index pte_index -static inline unsigned long pte_index(unsigned long address) -{ - return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); -} - -static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) -{ - return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); -} - static inline int pmd_bad(pmd_t pmd) { return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; @@ -906,12 +877,6 @@ static inline unsigned long pud_page_vaddr(pud_t pud) */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) -/* Find an entry in the second-level page table.. */ -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) -{ - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); -} - #define pud_leaf pud_large static inline int pud_large(pud_t pud) { @@ -931,11 +896,6 @@ static inline int pud_large(pud_t pud) } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ -static inline unsigned long pud_index(unsigned long address) -{ - return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); -} - #if CONFIG_PGTABLE_LEVELS > 3 static inline int p4d_none(p4d_t p4d) { @@ -958,12 +918,6 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) */ #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) -/* Find an entry in the third-level page table.. */ -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) -{ - return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address); -} - static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; @@ -1036,30 +990,6 @@ static inline int pgd_none(pgd_t pgd) #endif /* __ASSEMBLY__ */ -/* - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] - * - * this macro returns the index of the entry in the pgd page which would - * control the given virtual address - */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) - -/* - * pgd_offset() returns a (pgd_t *) - * pgd_index() is used get the offset into the pgd page's array of pgd_t's; - */ -#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) -/* - * a shortcut to get a pgd_t in a given mm - */ -#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) -/* - * a shortcut which implies the use of the kernel's pgd, instead - * of a process's - */ -#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) - - #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) @@ -1070,27 +1000,14 @@ void init_mem_mapping(void); void early_alloc_pgt_buf(void); extern void memblock_find_dma_reserve(void); + #ifdef CONFIG_X86_64 -/* Realmode trampoline initialization. 
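pte_index(), pmd_index(), pud_index(), pgd_index() and the *_offset() helpers deleted in this hunk were x86-private copies of arithmetic that now lives once in include/linux/pgtable.h. The computation they all share, in runnable form (assuming the usual x86-64 4-level shifts 12/21/30/39 and 512 entries per level):

	#include <stdio.h>

	#define SK_PAGE_SHIFT	12
	#define SK_PMD_SHIFT	21
	#define SK_PUD_SHIFT	30
	#define SK_PGDIR_SHIFT	39
	#define SK_PTRS		512	/* entries per table, every level */

	static unsigned long sk_index(unsigned long addr, int shift)
	{
		return (addr >> shift) & (SK_PTRS - 1);
	}

	int main(void)
	{
		unsigned long addr = 0x00007f0012345000UL;

		printf("pgd=%lu pud=%lu pmd=%lu pte=%lu\n",
		       sk_index(addr, SK_PGDIR_SHIFT),
		       sk_index(addr, SK_PUD_SHIFT),
		       sk_index(addr, SK_PMD_SHIFT),
		       sk_index(addr, SK_PAGE_SHIFT));
		return 0;
	}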
*/ extern pgd_t trampoline_pgd_entry; -static inline void __meminit init_trampoline_default(void) -{ - /* Default trampoline pgd value */ - trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; -} void __init poking_init(void); unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); - -# ifdef CONFIG_RANDOMIZE_MEMORY -void __meminit init_trampoline(void); -# else -# define init_trampoline init_trampoline_default -# endif -#else -static inline void init_trampoline(void) { } #endif /* local pte updates need not use xchg for locking */ @@ -1545,7 +1462,6 @@ static inline bool arch_faults_on_old_pte(void) return false; } -#include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0dca7f7aeff2..d7acae4120d5 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -32,42 +32,23 @@ extern pmd_t initial_pg_pmd[]; void paging_init(void); void sync_initial_page_table(void); -/* - * Define this if things work differently on an i386 and an i486: - * it will (on an i486) warn about kernel memory accesses that are - * done without a 'access_ok( ..)' - */ -#undef TEST_ACCESS_OK - #ifdef CONFIG_X86_PAE # include <asm/pgtable-3level.h> #else # include <asm/pgtable-2level.h> #endif -#if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte)) -#else -#define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) -#define pte_unmap(pte) do { } while (0) -#endif - /* Clear a kernel PTE and flush it from the TLB */ #define kpte_clear_flush(ptep, vaddr) \ do { \ pte_clear(&init_mm, (vaddr), (ptep)); \ - __flush_tlb_one_kernel((vaddr)); \ + flush_tlb_one_kernel((vaddr)); \ } while (0) #endif /* !__ASSEMBLY__ */ /* - * kern_addr_valid() is (1) for FLATMEM and (0) for - * SPARSEMEM and DISCONTIGMEM + * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM */ #ifdef CONFIG_FLATMEM #define kern_addr_valid(addr) (1) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index df1373415f11..1b68d24dc6a0 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -53,6 +53,12 @@ static inline void sync_initial_page_table(void) { } struct mm_struct; +#define mm_p4d_folded mm_p4d_folded +static inline bool mm_p4d_folded(struct mm_struct *mm) +{ + return !pgtable_l5_enabled(); +} + void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); @@ -180,10 +186,6 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); /* PTE - Level 1 access. */ -/* x86-64 always has all page tables mapped. */ -#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) ((void)(pte))/* NOP */ - /* * Encode and de-code a swap entry * diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..8f63efb2a2cc 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -159,4 +159,6 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? 
PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b6606fe6cfdf..2da1f95b88d7 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -194,7 +194,6 @@ enum page_cache_mode { #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_RX (__PP| 0| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) @@ -220,7 +219,6 @@ enum page_cache_mode { #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) -#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) @@ -284,6 +282,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +static inline pgprot_t pgprot_nx(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | _PAGE_NX); +} +#define pgprot_nx pgprot_nx + #ifdef CONFIG_X86_PAE /* @@ -467,9 +471,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; -extern uint8_t __pte2cachemode_tbl[8]; - #define __pte2cm_idx(cb) \ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ @@ -479,43 +480,26 @@ extern uint8_t __pte2cachemode_tbl[8]; (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ (((i) & 1) << _PAGE_BIT_PWT)) -static inline unsigned long cachemode2protval(enum page_cache_mode pcm) -{ - if (likely(pcm == 0)) - return 0; - return __cachemode2pte_tbl[pcm]; -} -static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) +unsigned long cachemode2protval(enum page_cache_mode pcm); + +static inline pgprotval_t protval_4k_2_large(pgprotval_t val) { - return __pgprot(cachemode2protval(pcm)); + return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } -static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { - unsigned long masked; - - masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; - if (likely(masked == 0)) - return 0; - return __pte2cachemode_tbl[__pte2cm_idx(masked)]; + return __pgprot(protval_4k_2_large(pgprot_val(pgprot))); } -static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) +static inline pgprotval_t protval_large_2_4k(pgprotval_t val) { - pgprotval_t val = pgprot_val(pgprot); - pgprot_t new; - - pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | - ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); - return new; + return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT_LARGE) >> + (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { - pgprotval_t val = pgprot_val(pgprot); - pgprot_t new; - - pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | - ((val & _PAGE_PAT_LARGE) 
>> - (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); - return new; + return __pgprot(protval_large_2_4k(pgprot_val(pgprot))); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3bcf27caf6c9..42cd333616c4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -113,9 +113,10 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ - /* Cache QoS architectural values: */ + /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ + int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ @@ -727,7 +728,6 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( - UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -737,7 +737,6 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" - UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif @@ -824,7 +823,7 @@ static inline void prefetch(const void *x) * Useful for spinlocks to avoid one state transition in the * cache coherency protocol: */ -static inline void prefetchw(const void *x) +static __always_inline void prefetchw(const void *x) { alternative_input(BASE_PREFETCH, "prefetchw %P1", X86_FEATURE_3DNOWPREFETCH, diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6d6475fdd327..ebedeab48704 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -123,7 +123,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) * On x86_64, vm86 mode is mercifully nonexistent, and we don't need * the extra check. */ -static inline int user_mode(struct pt_regs *regs) +static __always_inline int user_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl.h index f6b7fe2833cc..07603064df8f 100644 --- a/arch/x86/include/asm/resctrl_sched.h +++ b/arch/x86/include/asm/resctrl.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_RESCTRL_SCHED_H -#define _ASM_X86_RESCTRL_SCHED_H +#ifndef _ASM_X86_RESCTRL_H +#define _ASM_X86_RESCTRL_H #ifdef CONFIG_X86_CPU_RESCTRL @@ -84,10 +84,13 @@ static inline void resctrl_sched_in(void) __resctrl_sched_in(); } +void resctrl_cpu_detect(struct cpuinfo_x86 *c); + #else static inline void resctrl_sched_in(void) {} +static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ -#endif /* _ASM_X86_RESCTRL_SCHED_H */ +#endif /* _ASM_X86_RESCTRL_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ed8ec011a9fd..84b645cc8bc9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -75,7 +75,17 @@ extern char _text[]; static inline bool kaslr_enabled(void) { - return !!(boot_params.hdr.loadflags & KASLR_FLAG); + return IS_ENABLED(CONFIG_RANDOMIZE_MEMORY) && + !!(boot_params.hdr.loadflags & KASLR_FLAG); +} + +/* + * Apply no randomization if KASLR was disabled at boot or if KASAN + * is enabled. KASAN shadow mappings rely on regions being PGD aligned. 
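protval_4k_2_large()/protval_large_2_4k(), introduced in the pgtable_types.h hunk above, exist because the PAT selector sits at bit 7 in 4K PTEs while bit 7 means PSE in 2M/1G entries, where PAT moves to bit 12. A worked example of the relocation, assuming _PAGE_BIT_PAT=7 and _PAGE_BIT_PAT_LARGE=12:

	#include <stdio.h>
	#include <stdint.h>

	#define SK_PAT		(1ULL << 7)
	#define SK_PAT_LARGE	(1ULL << 12)

	static uint64_t sk_4k_2_large(uint64_t val)
	{
		/* clear both PAT bits, then move bit 7 up to bit 12 */
		return (val & ~(SK_PAT | SK_PAT_LARGE)) | ((val & SK_PAT) << 5);
	}

	static uint64_t sk_large_2_4k(uint64_t val)
	{
		return (val & ~(SK_PAT | SK_PAT_LARGE)) |
		       ((val & SK_PAT_LARGE) >> 5);
	}

	int main(void)
	{
		uint64_t v = SK_PAT | 0x3;	/* PAT set, 4K encoding */

		printf("%#llx -> %#llx\n", (unsigned long long)v,
		       (unsigned long long)sk_4k_2_large(v)); /* bit 7 -> 12 */
		(void)sk_large_2_4k(0);
		return 0;
	}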
+ */ +static inline bool kaslr_memory_enabled(void) +{ + return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } static inline unsigned long kaslr_offset(void) diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 27c47d183f4b..8b58d6975d5d 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -57,8 +57,10 @@ static __always_inline unsigned long smap_save(void) { unsigned long flags; - asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, - X86_FEATURE_SMAP) + asm volatile ("# smap_save\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "pushf; pop %0; " __ASM_CLAC "\n\t" + "1:" : "=rm" (flags) : : "memory", "cc"); return flags; @@ -66,7 +68,10 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { - asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + asm volatile ("# smap_restore\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "push %0; popf\n\t" + "1:" : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6d37b8fcfc77..eb8e781c4353 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -7,6 +7,7 @@ #include <asm/nops.h> #include <asm/processor-flags.h> +#include <linux/irqflags.h> #include <linux/jump_label.h> /* @@ -27,14 +28,14 @@ static inline unsigned long native_read_cr0(void) return val; } -static inline unsigned long native_read_cr2(void) +static __always_inline unsigned long native_read_cr2(void) { unsigned long val; asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); return val; } -static inline void native_write_cr2(unsigned long val) +static __always_inline void native_write_cr2(unsigned long val) { asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); } @@ -129,7 +130,16 @@ static inline void native_wbinvd(void) asm volatile("wbinvd": : :"memory"); } -extern asmlinkage void native_load_gs_index(unsigned); +extern asmlinkage void asm_load_gs_index(unsigned int selector); + +static inline void native_load_gs_index(unsigned int selector) +{ + unsigned long flags; + + local_irq_save(flags); + asm_load_gs_index(selector); + local_irq_restore(flags); +} static inline unsigned long __read_cr4(void) { @@ -150,12 +160,12 @@ static inline void write_cr0(unsigned long x) native_write_cr0(x); } -static inline unsigned long read_cr2(void) +static __always_inline unsigned long read_cr2(void) { return native_read_cr2(); } -static inline void write_cr2(unsigned long x) +static __always_inline void write_cr2(unsigned long x) { native_write_cr2(x); } @@ -186,7 +196,7 @@ static inline void wbinvd(void) #ifdef CONFIG_X86_64 -static inline void load_gs_index(unsigned selector) +static inline void load_gs_index(unsigned int selector) { native_load_gs_index(selector); } diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index bf3e34b25afc..323db6c5852a 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -3,29 +3,7 @@ #define _ASM_X86_SPINLOCK_TYPES_H #include <linux/types.h> - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -#define __TICKET_LOCK_INC 2 -#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) -#else -#define __TICKET_LOCK_INC 1 -#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) -#endif - -#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) -typedef u8 __ticket_t; -typedef u16 __ticketpair_t; -#else -typedef u16 __ticket_t; -typedef u32 __ticketpair_t; 
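The smap_save()/smap_restore() rework moves the stack-modifying pushf/pop and push/popf sequences out of the ALTERNATIVE replacement text; the alternative now only patches a "jmp 1f" over them, which keeps the stack operations in the main instruction stream (presumably for the benefit of objtool's stack validation). A control-flow model of the new smap_save(); CLAC is left as a comment since it is a CPL0-only instruction:

	#include <stdbool.h>

	static unsigned long smap_save_model(bool cpu_has_smap)
	{
		unsigned long flags = 0;

		if (!cpu_has_smap)	/* ALTERNATIVE("jmp 1f", "", SMAP) */
			goto out;	/* skip the stack-touching sequence */

		__asm__ volatile("pushf; pop %0" : "=rm" (flags) :: "memory");
		/* the kernel follows this with CLAC (omitted here) */
	out:
		return flags;
	}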
-#endif - -#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) - -#define TICKET_SHIFT (sizeof(__ticket_t) * 8) - #include <asm-generic/qspinlock_types.h> - #include <asm-generic/qrwlock_types.h> #endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 91e29b6a86a5..9804a7957f4e 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -55,8 +55,13 @@ /* * Initialize the stackprotector canary value. * - * NOTE: this must only be called from functions that never return, + * NOTE: this must only be called from functions that never return * and it must always be inlined. + * + * In addition, it should be called from a compilation unit for which + * stack protector is disabled. Alternatively, the caller should not end + * with a function call which gets tail-call optimized as that would + * lead to checking a modified canary value. */ static __always_inline void boot_init_stack_canary(void) { diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 14db05086bbf..5ae5a68e469d 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -87,7 +87,7 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl); + unsigned long *stack, const char *log_lvl); /* The form of the top of the frame on the stack */ struct stack_frame { diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 6ece8561ba66..8a1f5382a4ea 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -96,7 +96,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ - u8 reserved_7[768]; }; @@ -203,8 +202,16 @@ struct __attribute__ ((__packed__)) vmcb_save_area { u64 last_excp_to; }; + +static inline void __unused_size_checks(void) +{ + BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); + BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); +} + struct __attribute__ ((__packed__)) vmcb { struct vmcb_control_area control; + u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; }; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 0e059b73437b..9f69cc497f4b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,27 +12,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -/* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *next) -{ -#ifdef CONFIG_VMAP_STACK - /* - * If we switch to a stack that has a top-level paging entry - * that is not present in the current mm, the resulting #PF will - * will be promoted to a double-fault and we'll panic. Probe - * the new stack now so that vmalloc_fault can fix up the page - * tables if needed. This can only happen if we use a stack - * in vmap space. - * - * We assume that the stack is aligned so that it never spans - * more than one top-level paging entry. - * - * To minimize cache pollution, just follow the stack pointer. 
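The svm.h hunk pins sizeof(vmcb_control_area) to 256 and sizeof(vmcb_save_area) to 0x298 with BUILD_BUG_ON(), and re-expresses the pad as 1024 - sizeof(control) so the save area stays at its architectural offset even as control-area fields are appended. The same invariant in standalone C11, with _Static_assert standing in for BUILD_BUG_ON() and the sizes copied from the hunk:

	#include <stddef.h>

	struct sk_ctl  { unsigned char bytes[256]; };	/* control area size */
	struct sk_save { unsigned char bytes[0x298]; };	/* save area size */

	struct sk_vmcb {
		struct sk_ctl control;
		unsigned char reserved_control[1024 - sizeof(struct sk_ctl)];
		struct sk_save save;
	};

	/* C11 stand-in for the kernel's BUILD_BUG_ON() checks */
	_Static_assert(offsetof(struct sk_vmcb, save) == 1024,
		       "save area must start at architectural offset 1024");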
- */ - READ_ONCE(*(unsigned char *)next->thread.sp); -#endif -} - asmlinkage void ret_from_fork(void); /* @@ -67,8 +46,6 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(next); \ - \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 67315fa3956a..6593b42cb379 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -64,7 +64,7 @@ extern void text_poke_finish(void); #define DISP32_SIZE 4 -static inline int text_opcode_size(u8 opcode) +static __always_inline int text_opcode_size(u8 opcode) { int size = 0; @@ -118,12 +118,14 @@ extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; #ifndef CONFIG_UML_X86 -static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) +static __always_inline +void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } -static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) +static __always_inline +void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* * The int3 handler in entry_64.S adds a gap between the @@ -138,7 +140,8 @@ static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) *(unsigned long *)regs->sp = val; } -static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) +static __always_inline +void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6f66d841262d..8c87a2e0b660 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -13,140 +13,51 @@ #include <asm/pti.h> #include <asm/processor-flags.h> -/* - * The x86 feature is called PCID (Process Context IDentifier). It is similar - * to what is traditionally called ASID on the RISC processors. - * - * We don't use the traditional ASID implementation, where each process/mm gets - * its own ASID and flush/restart when we run out of ASID space. - * - * Instead we have a small per-cpu array of ASIDs and cache the last few mm's - * that came by on this CPU, allowing cheaper switch_mm between processes on - * this CPU. - * - * We end up with different spaces for different things. To avoid confusion we - * use different names for each of them: - * - * ASID - [0, TLB_NR_DYN_ASIDS-1] - * the canonical identifier for an mm - * - * kPCID - [1, TLB_NR_DYN_ASIDS] - * the value we write into the PCID part of CR3; corresponds to the - * ASID+1, because PCID 0 is special. - * - * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] - * for KPTI each mm has two address spaces and thus needs two - * PCID values, but we can still do with a single ASID denomination - * for each mm. Corresponds to kPCID + 2048. - * - */ - -/* There are 12 bits of space for ASIDS in CR3 */ -#define CR3_HW_ASID_BITS 12 - -/* - * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for - * user/kernel switches - */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION -# define PTI_CONSUMED_PCID_BITS 1 -#else -# define PTI_CONSUMED_PCID_BITS 0 -#endif +void __flush_tlb_all(void); -#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) +#define TLB_FLUSH_ALL -1UL -/* - * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account - * for them being zero-based. 
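int3_emulate_call() above synthesizes a CALL from the #BP handler: push the address of the instruction after the patched 5-byte CALL (regs->ip points one byte past the INT3, which overlays the CALL's first byte), then redirect ip. The real push writes into the gap the entry code reserves below pt_regs; the model below just uses an ordinary buffer, and the sk_* names and addresses are made up:

	#include <stdio.h>

	#define SK_INT3_SIZE	1
	#define SK_CALL_SIZE	5

	struct sk_regs { unsigned long ip, sp; };

	static void sk_emulate_push(struct sk_regs *r, unsigned long val)
	{
		r->sp -= sizeof(val);
		*(unsigned long *)r->sp = val;
	}

	static void sk_emulate_call(struct sk_regs *r, unsigned long func)
	{
		/* return address = insn start + CALL length
		 *                = (ip - INT3 size) + CALL size */
		sk_emulate_push(r, r->ip - SK_INT3_SIZE + SK_CALL_SIZE);
		r->ip = func;
	}

	int main(void)
	{
		unsigned long stack[16];
		struct sk_regs r = { .ip = 0x1001,
				     .sp = (unsigned long)&stack[16] };

		sk_emulate_call(&r, 0x2000);
		printf("ip=%#lx ret=%#lx\n", r.ip, *(unsigned long *)r.sp);
		return 0;	/* ip=0x2000 ret=0x1005 */
	}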
Another -1 is because PCID 0 is reserved for - * use by non-PCID-aware users. - */ -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) +void cr4_update_irqsoff(unsigned long set, unsigned long clear); +unsigned long cr4_read_shadow(void); -/* - * 6 because 6 should be plenty and struct tlb_state will fit in two cache - * lines. - */ -#define TLB_NR_DYN_ASIDS 6 - -/* - * Given @asid, compute kPCID - */ -static inline u16 kern_pcid(u16 asid) +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits_irqsoff(unsigned long mask) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - -#ifdef CONFIG_PAGE_TABLE_ISOLATION - /* - * Make sure that the dynamic ASID space does not confict with the - * bit we are using to switch between user and kernel ASIDs. - */ - BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); - - /* - * The ASID being passed in here should have respected the - * MAX_ASID_AVAILABLE and thus never have the switch bit set. - */ - VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); -#endif - /* - * The dynamically-assigned ASIDs that get passed in are small - * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, - * so do not bother to clear it. - * - * If PCID is on, ASID-aware code paths put the ASID+1 into the - * PCID bits. This serves two purposes. It prevents a nasty - * situation in which PCID-unaware code saves CR3, loads some other - * value (with PCID == 0), and then restores CR3, thus corrupting - * the TLB for ASID 0 if the saved ASID was nonzero. It also means - * that any bugs involving loading a PCID-enabled CR3 with - * CR4.PCIDE off will trigger deterministically. - */ - return asid + 1; + cr4_update_irqsoff(mask, 0); } -/* - * Given @asid, compute uPCID - */ -static inline u16 user_pcid(u16 asid) +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits_irqsoff(unsigned long mask) { - u16 ret = kern_pcid(asid); -#ifdef CONFIG_PAGE_TABLE_ISOLATION - ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; -#endif - return ret; + cr4_update_irqsoff(0, mask); } -struct pgd_t; -static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits(unsigned long mask) { - if (static_cpu_has(X86_FEATURE_PCID)) { - return __sme_pa(pgd) | kern_pcid(asid); - } else { - VM_WARN_ON_ONCE(asid != 0); - return __sme_pa(pgd); - } + unsigned long flags; + + local_irq_save(flags); + cr4_set_bits_irqsoff(mask); + local_irq_restore(flags); } -static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits(unsigned long mask) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - /* - * Use boot_cpu_has() instead of this_cpu_has() as this function - * might be called during early boot. This should work even after - * boot because all CPU's the have same capabilities: - */ - VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); - return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; + unsigned long flags; + + local_irq_save(flags); + cr4_clear_bits_irqsoff(mask); + local_irq_restore(flags); } -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#else -#define __flush_tlb() __native_flush_tlb() -#define __flush_tlb_global() __native_flush_tlb_global() -#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) -#endif +#ifndef MODULE +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. 
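The ASID/kPCID/uPCID scheme described in the comment above boils down to two small mappings (the helpers move out of line in this series): kPCID = ASID + 1 because PCID 0 is reserved for non-PCID-aware code, and uPCID = kPCID with the PTI user bit set (bit 11, hence the 2048 in the comment). A runnable check of those numbers; the sk_* names are illustrative:

	#include <stdio.h>

	#define SK_PTI_USER_PCID_BIT	11	/* X86_CR3_PTI_PCID_USER_BIT */

	static unsigned short sk_kern_pcid(unsigned short asid)
	{
		return asid + 1;	/* PCID 0 is reserved */
	}

	static unsigned short sk_user_pcid(unsigned short asid)
	{
		return sk_kern_pcid(asid) | (1u << SK_PTI_USER_PCID_BIT);
	}

	int main(void)
	{
		for (unsigned short asid = 0; asid < 6; asid++) /* TLB_NR_DYN_ASIDS */
			printf("ASID %u -> kPCID %u, uPCID %u\n",
			       asid, sk_kern_pcid(asid), sk_user_pcid(asid));
		return 0;
	}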
+ */ +#define TLB_NR_DYN_ASIDS 6 struct tlb_context { u64 ctx_id; @@ -242,38 +153,7 @@ struct tlb_state { }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); -/* - * Blindly accessing user memory from NMI context can be dangerous - * if we're in the middle of switching the current user task or - * switching the loaded mm. It can also be dangerous if we - * interrupted some kernel code that was temporarily using a - * different mm. - */ -static inline bool nmi_uaccess_okay(void) -{ - struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); - struct mm_struct *current_mm = current->mm; - - VM_WARN_ON_ONCE(!loaded_mm); - - /* - * The condition we want to check is - * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, - * if we're running in a VM with shadow paging, and nmi_uaccess_okay() - * is supposed to be reasonably fast. - * - * Instead, we check the almost equivalent but somewhat conservative - * condition below, and we rely on the fact that switch_mm_irqs_off() - * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. - */ - if (loaded_mm != current_mm) - return false; - - VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); - - return true; -} - +bool nmi_uaccess_okay(void); #define nmi_uaccess_okay nmi_uaccess_okay /* Initialize cr4 shadow for this CPU. */ @@ -282,250 +162,12 @@ static inline void cr4_init_shadow(void) this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } -static inline void __cr4_set(unsigned long cr4) -{ - lockdep_assert_irqs_disabled(); - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); -} - -/* Set in this cpu's CR4. */ -static inline void cr4_set_bits_irqsoff(unsigned long mask) -{ - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 | mask) != cr4) - __cr4_set(cr4 | mask); -} - -/* Clear in this cpu's CR4. */ -static inline void cr4_clear_bits_irqsoff(unsigned long mask) -{ - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 & ~mask) != cr4) - __cr4_set(cr4 & ~mask); -} - -/* Set in this cpu's CR4. */ -static inline void cr4_set_bits(unsigned long mask) -{ - unsigned long flags; - - local_irq_save(flags); - cr4_set_bits_irqsoff(mask); - local_irq_restore(flags); -} - -/* Clear in this cpu's CR4. */ -static inline void cr4_clear_bits(unsigned long mask) -{ - unsigned long flags; - - local_irq_save(flags); - cr4_clear_bits_irqsoff(mask); - local_irq_restore(flags); -} - -static inline void cr4_toggle_bits_irqsoff(unsigned long mask) -{ - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - __cr4_set(cr4 ^ mask); -} - -/* Read the CR4 shadow. */ -static inline unsigned long cr4_read_shadow(void) -{ - return this_cpu_read(cpu_tlbstate.cr4); -} - -/* - * Mark all other ASIDs as invalid, preserves the current. - */ -static inline void invalidate_other_asid(void) -{ - this_cpu_write(cpu_tlbstate.invalidate_other, true); -} - -/* - * Save some of cr4 feature set we're using (e.g. Pentium 4MB - * enable and PPro Global page enable), so that any CPU's that boot - * up after us can get the correct flags. This should only be used - * during boot on the boot cpu. - */ extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; -static inline void cr4_set_bits_and_update_boot(unsigned long mask) -{ - mmu_cr4_features |= mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4_set_bits(mask); -} - extern void initialize_tlbstate_and_flush(void); /* - * Given an ASID, flush the corresponding user ASID. 
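cr4_set_bits_irqsoff()/cr4_clear_bits_irqsoff() above now funnel into a single out-of-line cr4_update_irqsoff(set, clear). A plausible shape for that helper, assuming it keeps the shadow-compare-then-write logic of the removed inline code; the sk_* stubs stand in for the per-CPU shadow and the hardware CR4 write:

	static unsigned long sk_cr4_shadow;	/* cpu_tlbstate.cr4 stand-in */

	static void sk_write_cr4_hw(unsigned long val)
	{
		(void)val;	/* "mov %0, %%cr4" in the real thing */
	}

	static void sk_cr4_update_irqsoff(unsigned long set, unsigned long clear)
	{
		unsigned long cr4 = sk_cr4_shadow;
		unsigned long newval = (cr4 & ~clear) | set;

		if (newval != cr4) {
			sk_cr4_shadow = newval;	/* shadow first, then HW */
			sk_write_cr4_hw(newval);
		}
	}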
We can delay this - * until the next time we switch to it. - * - * See SWITCH_TO_USER_CR3. - */ -static inline void invalidate_user_asid(u16 asid) -{ - /* There is no user ASID if address space separation is off */ - if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) - return; - - /* - * We only have a single ASID if PCID is off and the CR3 - * write will have flushed it. - */ - if (!cpu_feature_enabled(X86_FEATURE_PCID)) - return; - - if (!static_cpu_has(X86_FEATURE_PTI)) - return; - - __set_bit(kern_pcid(asid), - (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); -} - -/* - * flush the entire current user mapping - */ -static inline void __native_flush_tlb(void) -{ - /* - * Preemption or interrupts must be disabled to protect the access - * to the per CPU variable and to prevent being preempted between - * read_cr3() and write_cr3(). - */ - WARN_ON_ONCE(preemptible()); - - invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); - - /* If current->mm == NULL then the read_cr3() "borrows" an mm */ - native_write_cr3(__native_read_cr3()); -} - -/* - * flush everything - */ -static inline void __native_flush_tlb_global(void) -{ - unsigned long cr4, flags; - - if (static_cpu_has(X86_FEATURE_INVPCID)) { - /* - * Using INVPCID is considerably faster than a pair of writes - * to CR4 sandwiched inside an IRQ flag save/restore. - * - * Note, this works with CR4.PCIDE=0 or 1. - */ - invpcid_flush_all(); - return; - } - - /* - * Read-modify-write to CR4 - protect it from preemption and - * from interrupts. (Use the raw variant because this code can - * be called from deep inside debugging code.) - */ - raw_local_irq_save(flags); - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); - - raw_local_irq_restore(flags); -} - -/* - * flush one page in the user mapping - */ -static inline void __native_flush_tlb_one_user(unsigned long addr) -{ - u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); - - if (!static_cpu_has(X86_FEATURE_PTI)) - return; - - /* - * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. - * Just use invalidate_user_asid() in case we are called early. - */ - if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) - invalidate_user_asid(loaded_mm_asid); - else - invpcid_flush_one(user_pcid(loaded_mm_asid), addr); -} - -/* - * flush everything - */ -static inline void __flush_tlb_all(void) -{ - /* - * This is to catch users with enabled preemption and the PGE feature - * and don't trigger the warning in __native_flush_tlb(). - */ - VM_WARN_ON_ONCE(preemptible()); - - if (boot_cpu_has(X86_FEATURE_PGE)) { - __flush_tlb_global(); - } else { - /* - * !PGE -> !PCID (setup_pcid()), thus every flush is total. - */ - __flush_tlb(); - } -} - -/* - * flush one page in the kernel mapping - */ -static inline void __flush_tlb_one_kernel(unsigned long addr) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); - - /* - * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its - * paravirt equivalent. Even with PCID, this is sufficient: we only - * use PCID if we also use global PTEs for the kernel mapping, and - * INVLPG flushes global translations across all address spaces. 
- * - * If PTI is on, then the kernel is mapped with non-global PTEs, and - * __flush_tlb_one_user() will flush the given address for the current - * kernel address space and for its usermode counterpart, but it does - * not flush it for other address spaces. - */ - __flush_tlb_one_user(addr); - - if (!static_cpu_has(X86_FEATURE_PTI)) - return; - - /* - * See above. We need to propagate the flush to all other address - * spaces. In principle, we only need to propagate it to kernelmode - * address spaces, but the extra bookkeeping we would need is not - * worth it. - */ - invalidate_other_asid(); -} - -#define TLB_FLUSH_ALL -1UL - -/* * TLB flushing: * * - flush_tlb_all() flushes all processes TLBs @@ -563,7 +205,15 @@ struct flush_tlb_info { bool freed_tables; }; -#define local_flush_tlb() __flush_tlb() +void flush_tlb_local(void); +void flush_tlb_one_user(unsigned long addr); +void flush_tlb_one_kernel(unsigned long addr); +void flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #define flush_tlb_mm(mm) \ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) @@ -585,9 +235,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } -void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info); - static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* @@ -608,12 +255,6 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); -#ifndef CONFIG_PARAVIRT -#define flush_tlb_others(mask, info) \ - native_flush_tlb_others(mask, info) - -#define paravirt_tlb_remove_table(tlb, page) \ - tlb_remove_page(tlb, (void *)(page)) -#endif +#endif /* !MODULE */ #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h index 57c8da027d99..f0f9bcdb74d9 100644 --- a/arch/x86/include/asm/trace/common.h +++ b/arch/x86/include/asm/trace/common.h @@ -5,12 +5,8 @@ DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); #define trace_pagefault_enabled() \ static_branch_unlikely(&trace_pagefault_key) -DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); -#define trace_resched_ipi_enabled() \ - static_branch_unlikely(&trace_resched_ipi_key) #else static inline bool trace_pagefault_enabled(void) { return false; } -static inline bool trace_resched_ipi_enabled(void) { return false; } #endif #endif diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 33b9d0f0aafe..88e7f0f3bf62 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -10,9 +10,6 @@ #ifdef CONFIG_X86_LOCAL_APIC -extern int trace_resched_ipi_reg(void); -extern void trace_resched_ipi_unreg(void); - DECLARE_EVENT_CLASS(x86_irq_vector, TP_PROTO(int vector), @@ -37,18 +34,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), NULL, NULL); -#define DEFINE_RESCHED_IPI_EVENT(name) \ -DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ - TP_PROTO(int vector), \ - TP_ARGS(vector), \ - trace_resched_ipi_reg, \ - trace_resched_ipi_unreg); \ -DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ - TP_PROTO(int vector), \ - TP_ARGS(vector), \ - trace_resched_ipi_reg, \ - trace_resched_ipi_unreg); - /* * local_timer - called when entering/exiting a local timer interrupt * 
vector handler @@ -99,7 +84,7 @@ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); /* * reschedule - called when entering/exiting a reschedule vector handler */ -DEFINE_RESCHED_IPI_EVENT(reschedule); +DEFINE_IRQ_VECTOR_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h new file mode 100644 index 000000000000..082f45631fa9 --- /dev/null +++ b/arch/x86/include/asm/trapnr.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_TRAPNR_H +#define _ASM_X86_TRAPNR_H + +/* Interrupts/Exceptions */ + +#define X86_TRAP_DE 0 /* Divide-by-zero */ +#define X86_TRAP_DB 1 /* Debug */ +#define X86_TRAP_NMI 2 /* Non-maskable Interrupt */ +#define X86_TRAP_BP 3 /* Breakpoint */ +#define X86_TRAP_OF 4 /* Overflow */ +#define X86_TRAP_BR 5 /* Bound Range Exceeded */ +#define X86_TRAP_UD 6 /* Invalid Opcode */ +#define X86_TRAP_NM 7 /* Device Not Available */ +#define X86_TRAP_DF 8 /* Double Fault */ +#define X86_TRAP_OLD_MF 9 /* Coprocessor Segment Overrun */ +#define X86_TRAP_TS 10 /* Invalid TSS */ +#define X86_TRAP_NP 11 /* Segment Not Present */ +#define X86_TRAP_SS 12 /* Stack Segment Fault */ +#define X86_TRAP_GP 13 /* General Protection Fault */ +#define X86_TRAP_PF 14 /* Page Fault */ +#define X86_TRAP_SPURIOUS 15 /* Spurious Interrupt */ +#define X86_TRAP_MF 16 /* x87 Floating-Point Exception */ +#define X86_TRAP_AC 17 /* Alignment Check */ +#define X86_TRAP_MC 18 /* Machine Check */ +#define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */ +#define X86_TRAP_VE 20 /* Virtualization Exception */ +#define X86_TRAP_CP 21 /* Control Protection Exception */ +#define X86_TRAP_IRET 32 /* IRET Exception */ + +#endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c26a7e1d8a2c..714b1a30e7b0 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -6,87 +6,9 @@ #include <linux/kprobes.h> #include <asm/debugreg.h> +#include <asm/idtentry.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... 
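Moving the vector numbers out of the traps.h enum (deleted further below) into plain #defines in the new trapnr.h makes them visible to the preprocessor, so assembly code, notably the new idtentry machinery, can consume them as well as C. A trivial C illustration of that preprocessor visibility; the two values are copied from the header:

	#include <stdio.h>

	#define X86_TRAP_DE	0	/* from asm/trapnr.h */
	#define X86_TRAP_PF	14

	static const char * const sk_trap_name[] = {
		[X86_TRAP_DE] = "divide error",
		[X86_TRAP_PF] = "page fault",
	};

	int main(void)
	{
		printf("vector %d: %s\n", X86_TRAP_PF,
		       sk_trap_name[X86_TRAP_PF]);
		return 0;
	}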
*/ -#define dotraplinkage __visible - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -#ifdef CONFIG_X86_64 -asmlinkage void double_fault(void); -#endif -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void async_page_fault(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void coprocessor_error(void); -asmlinkage void alignment_check(void); -#ifdef CONFIG_X86_MCE -asmlinkage void machine_check(void); -#endif /* CONFIG_X86_MCE */ -asmlinkage void simd_coprocessor_error(void); - -#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -asmlinkage void xen_divide_error(void); -asmlinkage void xen_xennmi(void); -asmlinkage void xen_xendebug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_overflow(void); -asmlinkage void xen_bounds(void); -asmlinkage void xen_invalid_op(void); -asmlinkage void xen_device_not_available(void); -asmlinkage void xen_double_fault(void); -asmlinkage void xen_coprocessor_segment_overrun(void); -asmlinkage void xen_invalid_TSS(void); -asmlinkage void xen_segment_not_present(void); -asmlinkage void xen_stack_segment(void); -asmlinkage void xen_general_protection(void); -asmlinkage void xen_page_fault(void); -asmlinkage void xen_spurious_interrupt_bug(void); -asmlinkage void xen_coprocessor_error(void); -asmlinkage void xen_alignment_check(void); -#ifdef CONFIG_X86_MCE -asmlinkage void xen_machine_check(void); -#endif /* CONFIG_X86_MCE */ -asmlinkage void xen_simd_coprocessor_error(void); -#endif - -dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code); -dotraplinkage void do_debug(struct pt_regs *regs, long error_code); -dotraplinkage void do_nmi(struct pt_regs *regs, long error_code); -dotraplinkage void do_int3(struct pt_regs *regs, long error_code); -dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); -dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); -dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); -dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); -#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -#endif -dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); -dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); -dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code); -dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); -dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); -dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code); -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code); -dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code); -dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code); -#ifdef CONFIG_X86_32 -dotraplinkage void do_iret_error(struct 
pt_regs *regs, long error_code); -#endif -dotraplinkage void do_mce(struct pt_regs *regs, long error_code); - #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace @@ -94,6 +16,11 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); void __init trap_init(void); #endif +#ifdef CONFIG_X86_F00F_BUG +/* For handling the FOOF bug */ +void handle_invalid_op(struct pt_regs *regs); +#endif + static inline int get_si_code(unsigned long condition) { if (condition & DR_STEP) @@ -107,21 +34,6 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); -#ifndef CONFIG_X86_32 -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs); -asmlinkage void smp_threshold_interrupt(struct pt_regs *regs); -asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs); -#endif - -void smp_apic_timer_interrupt(struct pt_regs *regs); -void smp_spurious_interrupt(struct pt_regs *regs); -void smp_error_interrupt(struct pt_regs *regs); -asmlinkage void smp_irq_move_cleanup_interrupt(void); - -extern void ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs); -extern void ist_begin_non_atomic(struct pt_regs *regs); -extern void ist_end_non_atomic(void); #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, @@ -129,31 +41,6 @@ void __noreturn handle_stack_overflow(const char *message, unsigned long fault_address); #endif -/* Interrupts/Exceptions */ -enum { - X86_TRAP_DE = 0, /* 0, Divide-by-zero */ - X86_TRAP_DB, /* 1, Debug */ - X86_TRAP_NMI, /* 2, Non-maskable Interrupt */ - X86_TRAP_BP, /* 3, Breakpoint */ - X86_TRAP_OF, /* 4, Overflow */ - X86_TRAP_BR, /* 5, Bound Range Exceeded */ - X86_TRAP_UD, /* 6, Invalid Opcode */ - X86_TRAP_NM, /* 7, Device Not Available */ - X86_TRAP_DF, /* 8, Double Fault */ - X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */ - X86_TRAP_TS, /* 10, Invalid TSS */ - X86_TRAP_NP, /* 11, Segment Not Present */ - X86_TRAP_SS, /* 12, Stack Segment Fault */ - X86_TRAP_GP, /* 13, General Protection Fault */ - X86_TRAP_PF, /* 14, Page Fault */ - X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */ - X86_TRAP_MF, /* 16, x87 Floating-Point Exception */ - X86_TRAP_AC, /* 17, Alignment Check */ - X86_TRAP_MC, /* 18, Machine Check */ - X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */ - X86_TRAP_IRET = 32, /* 32, IRET Exception */ -}; - /* * Page fault error code bits: * diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index d8f283b9a569..18dfa07d3ef0 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -504,12 +504,12 @@ do { \ * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. 
*/ -#define unsafe_copy_loop(dst, src, len, type, label) \ - while (len >= sizeof(type)) { \ - unsafe_put_user(*(type *)src,(type __user *)dst,label); \ - dst += sizeof(type); \ - src += sizeof(type); \ - len -= sizeof(type); \ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ } #define unsafe_copy_to_user(_dst,_src,_len,label) \ @@ -523,5 +523,21 @@ do { \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ } while (0) +#define HAVE_GET_KERNEL_NOFAULT + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __kr_err; \ + \ + __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ + sizeof(type), __kr_err); \ + if (unlikely(__kr_err)) \ + goto err_label; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ + __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ + sizeof(type), err_label) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 499578f7e6d7..70fc159ebe69 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -19,7 +19,7 @@ struct unwind_state { #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; - struct pt_regs *regs; + struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index f5e2eb12cb71..7d903fdb3f43 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -86,32 +86,15 @@ UNWIND_HINT sp_offset=\sp_offset .endm -.macro UNWIND_HINT_SAVE - UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE -.endm - -.macro UNWIND_HINT_RESTORE - UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +/* + * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN + * and sibling calls. On these, sp_offset denotes the expected offset from + * initial_func_cfi. 
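Two notes on the hunks above. The unsafe_copy_loop() change only adds parentheses around (src) and (dst): without them an argument such as p + 4 would expand to *(type *)p + 4, which dereferences p and then adds 4, rather than dereferencing p + 4. And the new __get_kernel_nofault()/__put_kernel_nofault() macros take a goto label for the error path; a minimal in-kernel consumer sketch (roughly the per-word pattern a copy_from_kernel_nofault() style helper would use; sk_read_kernel_ulong() is hypothetical):

	static int sk_read_kernel_ulong(unsigned long *dst,
					const unsigned long *src)
	{
		__get_kernel_nofault(dst, src, unsigned long, efault);
		return 0;
	efault:
		return -EFAULT;
	}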
+ */ +.macro UNWIND_HINT_RET_OFFSET sp_offset=8 + UNWIND_HINT type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset .endm -#else /* !__ASSEMBLY__ */ - -#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ - "987: \n\t" \ - ".pushsection .discard.unwind_hints\n\t" \ - /* struct unwind_hint */ \ - ".long 987b - .\n\t" \ - ".short " __stringify(sp_offset) "\n\t" \ - ".byte " __stringify(sp_reg) "\n\t" \ - ".byte " __stringify(type) "\n\t" \ - ".byte " __stringify(end) "\n\t" \ - ".balign 4 \n\t" \ - ".popsection\n\t" - -#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0) - -#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 389174eaec79..2fcc3ac12e76 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -123,12 +123,6 @@ enum uv_memprotect { UV_MEMPROT_ALLOW_RW }; -/* - * bios calls have 6 parameters - */ -extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); - extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, @@ -146,7 +140,6 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; -#define uv_partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 45ea95ce79b4..3db85626048f 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -8,6 +8,7 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; struct cpumask; struct mm_struct; +struct flush_tlb_info; #ifdef CONFIG_X86_UV #include <linux/efi.h> @@ -31,7 +32,6 @@ static inline bool is_early_uv_system(void) } extern int is_uv_system(void); extern int is_uv_hubbed(int uvtype); -extern int is_uv_hubless(int uvtype); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -44,7 +44,6 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline bool is_early_uv_system(void) { return 0; } static inline int is_uv_system(void) { return 0; } static inline int is_uv_hubbed(int uv) { return 0; } -static inline int is_uv_hubless(int uv) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 13687bf0e0a9..f1188bd47658 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -12,6 +12,8 @@ #define _ASM_X86_UV_UV_BAU_H #include <linux/bitmap.h> +#include <asm/idtentry.h> + #define BITSPERBYTE 8 /* @@ -799,12 +801,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) bitmap_zero(&dstp->bits, nbits); } -extern void uv_bau_message_intr1(void); -#ifdef CONFIG_TRACING -#define trace_uv_bau_message_intr1 uv_bau_message_intr1 -#endif -extern void uv_bau_timeout_intr1(void); - struct atomic_short { short counter; }; diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 950cd1395d5d..60ca0afdeaf9 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ 
b/arch/x86/include/asm/uv/uv_hub.h @@ -219,20 +219,6 @@ static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info; } -#define UV_HUB_INFO_VERSION 0x7150 -extern int uv_hub_info_version(void); -static inline int uv_hub_info_check(int version) -{ - if (uv_hub_info_version() == version) - return 0; - - pr_crit("UV: uv_hub_info version(%x) mismatch, expecting(%x)\n", - uv_hub_info_version(), version); - - BUG(); /* Catastrophic - cannot continue on unknown UV system */ -} -#define _uv_hub_info_check() uv_hub_info_check(UV_HUB_INFO_VERSION) - /* * HUB revision ranges for each UV HUB architecture. * This is a software convention - NOT the hardware revision numbers in @@ -244,51 +230,32 @@ static inline int uv_hub_info_check(int version) #define UV4_HUB_REVISION_BASE 7 #define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ -/* WARNING: UVx_HUB_IS_SUPPORTED defines are deprecated and will be removed */ static inline int is_uv1_hub(void) { -#ifdef UV1_HUB_IS_SUPPORTED return is_uv_hubbed(uv(1)); -#else - return 0; -#endif } static inline int is_uv2_hub(void) { -#ifdef UV2_HUB_IS_SUPPORTED return is_uv_hubbed(uv(2)); -#else - return 0; -#endif } static inline int is_uv3_hub(void) { -#ifdef UV3_HUB_IS_SUPPORTED return is_uv_hubbed(uv(3)); -#else - return 0; -#endif } /* First test "is UV4A", then "is UV4" */ static inline int is_uv4a_hub(void) { -#ifdef UV4A_HUB_IS_SUPPORTED if (is_uv_hubbed(uv(4))) return (uv_hub_info->hub_revision == UV4A_HUB_REVISION_BASE); -#endif return 0; } static inline int is_uv4_hub(void) { -#ifdef UV4_HUB_IS_SUPPORTED return is_uv_hubbed(uv(4)); -#else - return 0; -#endif } static inline int is_uvx_hub(void) @@ -692,7 +659,6 @@ static inline int uv_cpu_blade_processor_id(int cpu) { return uv_cpu_info_per(cpu)->blade_cpu_id; } -#define _uv_cpu_blade_processor_id 1 /* indicate function available */ /* Blade number to Node number (UV1..UV4 is 1:1) */ static inline int uv_blade_to_node(int blade) @@ -856,26 +822,6 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) } extern unsigned int uv_apicid_hibits; -static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) -{ - apicid |= uv_apicid_hibits; - return (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | - (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | - (vector << UVH_IPI_INT_VECTOR_SHFT); -} - -static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) -{ - unsigned long val; - unsigned long dmode = dest_Fixed; - - if (vector == NMI_VECTOR) - dmode = dest_NMI; - - val = uv_hub_ipi_value(apicid, vector, dmode); - uv_write_global_mmr64(pnode, UVH_IPI_INT, val); -} /* * Get the minimum revision number of the hub chips within the partition. diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 62c79e26a59a..9ee5ed6e8b34 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -99,13 +99,6 @@ #define UV3_HUB_PART_NUMBER_X 0x4321 #define UV4_HUB_PART_NUMBER 0x99a1 -/* Compat: Indicate which UV Hubs are supported. 
*/ -#define UV1_HUB_IS_SUPPORTED 1 -#define UV2_HUB_IS_SUPPORTED 1 -#define UV3_HUB_IS_SUPPORTED 1 -#define UV4_HUB_IS_SUPPORTED 1 -#define UV4A_HUB_IS_SUPPORTED 1 - /* Error function to catch undefined references */ extern unsigned long uv_undefined(char *str); diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h new file mode 100644 index 000000000000..75884d2cdec3 --- /dev/null +++ b/arch/x86/include/asm/vermagic.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#ifdef CONFIG_X86_64 +/* X86_64 does not define MODULE_PROC_FAMILY */ +#elif defined CONFIG_M486SX +#define MODULE_PROC_FAMILY "486SX " +#elif defined CONFIG_M486 +#define MODULE_PROC_FAMILY "486 " +#elif defined CONFIG_M586 +#define MODULE_PROC_FAMILY "586 " +#elif defined CONFIG_M586TSC +#define MODULE_PROC_FAMILY "586TSC " +#elif defined CONFIG_M586MMX +#define MODULE_PROC_FAMILY "586MMX " +#elif defined CONFIG_MCORE2 +#define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MATOM +#define MODULE_PROC_FAMILY "ATOM " +#elif defined CONFIG_M686 +#define MODULE_PROC_FAMILY "686 " +#elif defined CONFIG_MPENTIUMII +#define MODULE_PROC_FAMILY "PENTIUMII " +#elif defined CONFIG_MPENTIUMIII +#define MODULE_PROC_FAMILY "PENTIUMIII " +#elif defined CONFIG_MPENTIUMM +#define MODULE_PROC_FAMILY "PENTIUMM " +#elif defined CONFIG_MPENTIUM4 +#define MODULE_PROC_FAMILY "PENTIUM4 " +#elif defined CONFIG_MK6 +#define MODULE_PROC_FAMILY "K6 " +#elif defined CONFIG_MK7 +#define MODULE_PROC_FAMILY "K7 " +#elif defined CONFIG_MK8 +#define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_MELAN +#define MODULE_PROC_FAMILY "ELAN " +#elif defined CONFIG_MCRUSOE +#define MODULE_PROC_FAMILY "CRUSOE " +#elif defined CONFIG_MEFFICEON +#define MODULE_PROC_FAMILY "EFFICEON " +#elif defined CONFIG_MWINCHIPC6 +#define MODULE_PROC_FAMILY "WINCHIPC6 " +#elif defined CONFIG_MWINCHIP3D +#define MODULE_PROC_FAMILY "WINCHIP3D " +#elif defined CONFIG_MCYRIXIII +#define MODULE_PROC_FAMILY "CYRIXIII " +#elif defined CONFIG_MVIAC3_2 +#define MODULE_PROC_FAMILY "VIAC3-2 " +#elif defined CONFIG_MVIAC7 +#define MODULE_PROC_FAMILY "VIAC7 " +#elif defined CONFIG_MGEODEGX1 +#define MODULE_PROC_FAMILY "GEODEGX1 " +#elif defined CONFIG_MGEODE_LX +#define MODULE_PROC_FAMILY "GEODE " +#else +#error unknown processor family +#endif + +#ifdef CONFIG_X86_32 +# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY +#else +# define MODULE_ARCH_VERMAGIC "" +#endif + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5e090d1f03f8..cd7de4b401fe 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -527,10 +527,12 @@ struct vmx_msr_entry { /* * Exit Qualifications for entry failure during or after loading guest state */ -#define ENTRY_FAIL_DEFAULT 0 -#define ENTRY_FAIL_PDPTE 2 -#define ENTRY_FAIL_NMI 3 -#define ENTRY_FAIL_VMCS_LINK_PTR 4 +enum vm_entry_failure_code { + ENTRY_FAIL_DEFAULT = 0, + ENTRY_FAIL_PDPTE = 2, + ENTRY_FAIL_NMI = 3, + ENTRY_FAIL_VMCS_LINK_PTR = 4, +}; /* * Exit Qualifications for EPT Violations diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 96d9cd208610..6807153c0410 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -50,14 +50,12 @@ struct x86_init_resources { * @pre_vector_init: init code to run before interrupt vectors * are set up. 
* @intr_init: interrupt init code - * @trap_init: platform specific trap setup * @intr_mode_select: interrupt delivery mode selection * @intr_mode_init: interrupt delivery mode setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); - void (*trap_init)(void); void (*intr_mode_select)(void); void (*intr_mode_init)(void); }; diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index d50c7b747d8b..ba4c1b15908b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -38,11 +38,11 @@ #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/pgtable.h> #include <trace/events/xen.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/smap.h> #include <asm/nospec-branch.h> diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 790ce08e41f2..5941e18edd5a 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -11,7 +11,6 @@ #include <asm/extable.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <xen/interface/xen.h> #include <xen/interface/grant_table.h> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 3f3f780c8c65..17c5a038f42d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -385,32 +385,48 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_STATE_NESTED_FORMAT_VMX 0 -#define KVM_STATE_NESTED_FORMAT_SVM 1 /* unused */ +#define KVM_STATE_NESTED_FORMAT_SVM 1 #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 #define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_MTF_PENDING 0x00000008 +#define KVM_STATE_NESTED_GIF_SET 0x00000100 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +#define KVM_STATE_NESTED_SVM_VMCB_SIZE 0x1000 + +#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; }; struct kvm_vmx_nested_state_hdr { + __u32 flags; __u64 vmxon_pa; __u64 vmcs12_pa; + __u64 preemption_timer_deadline; struct { __u16 flags; } smm; }; +struct kvm_svm_nested_state_data { + /* Save area only used if KVM_STATE_NESTED_RUN_PENDING. */ + __u8 vmcb12[KVM_STATE_NESTED_SVM_VMCB_SIZE]; +}; + +struct kvm_svm_nested_state_hdr { + __u64 vmcb_pa; +}; + /* for KVM_CAP_NESTED_STATE */ struct kvm_nested_state { __u16 flags; @@ -419,6 +435,7 @@ struct kvm_nested_state { union { struct kvm_vmx_nested_state_hdr vmx; + struct kvm_svm_nested_state_hdr svm; /* Pad the header to 128 bytes. 
*/ __u8 pad[120]; @@ -431,6 +448,7 @@ struct kvm_nested_state { */ union { struct kvm_vmx_nested_state_data vmx[0]; + struct kvm_svm_nested_state_data svm[0]; } data; }; diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 2a8e0b6b9805..812e9b4c1114 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -31,6 +31,7 @@ #define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_PV_SCHED_YIELD 13 +#define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_HINTS_REALTIME 0 @@ -50,6 +51,8 @@ #define MSR_KVM_STEAL_TIME 0x4b564d03 #define MSR_KVM_PV_EOI_EN 0x4b564d04 #define MSR_KVM_POLL_CONTROL 0x4b564d05 +#define MSR_KVM_ASYNC_PF_INT 0x4b564d06 +#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07 struct kvm_steal_time { __u64 steal; @@ -81,6 +84,11 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_ENABLED (1 << 0) #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) #define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) +#define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3) + +/* MSR_KVM_ASYNC_PF_INT */ +#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) + /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 @@ -112,8 +120,13 @@ struct kvm_mmu_op_release_pt { #define KVM_PV_REASON_PAGE_READY 2 struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; + /* Used for 'page not present' events delivered via #PF */ + __u32 flags; + + /* Used for 'page ready' events delivered via interrupt notification */ + __u32 token; + + __u8 pad[56]; __u32 enabled; }; diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index 196fdd02b8b1..be5e2e747f50 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -2,8 +2,15 @@ #ifndef _UAPI_ASM_X86_UNISTD_H #define _UAPI_ASM_X86_UNISTD_H -/* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000UL +/* + * x32 syscall flag bit. Some user programs expect syscall NR macros + * and __X32_SYSCALL_BIT to have type int, even though syscall numbers + * are, for practical purposes, unsigned long. + * + * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right + * thing regardless. 
+ */ +#define __X32_SYSCALL_BIT 0x40000000 #ifndef __KERNEL__ # ifdef __i386__ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index e95b72ec19bc..b8ff9e8ac0d5 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -150,6 +150,9 @@ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" } +#define VMX_EXIT_REASON_FLAGS \ + { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } + #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ba89cabe5fcf..8ef4369a4f06 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -90,7 +90,6 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o @@ -102,9 +101,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o -endif +obj-$(CONFIG_X86_32) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 683ed9e12e6b..7bdc0239a943 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -20,11 +20,11 @@ #include <linux/pci.h> #include <linux/efi-bgrt.h> #include <linux/serial_core.h> +#include <linux/pgtable.h> #include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> -#include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> #include <asm/io.h> diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ed3b04483972..cc1fea76aab0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -10,9 +10,9 @@ #include <linux/memblock.h> #include <linux/dmi.h> #include <linux/cpumask.h> +#include <linux/pgtable.h> #include <asm/segment.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/realmode.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7867dfb3963e..8fd39ff74a49 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -18,7 +18,6 @@ #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> -#include <asm/pgtable.h> #include <asm/mce.h> #include <asm/nmi.h> #include <asm/cacheflush.h> @@ -783,6 +782,61 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. 
To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; @@ -957,28 +1011,29 @@ struct bp_patching_desc { static struct bp_patching_desc *bp_desc; -static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +static __always_inline +struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) { - struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ + struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */ - if (!desc || !atomic_inc_not_zero(&desc->refs)) + if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } -static inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_desc(struct bp_patching_desc *desc) { smp_mb__before_atomic(); - atomic_dec(&desc->refs); + arch_atomic_dec(&desc->refs); } -static inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { return _stext + tp->rel_addr; } -static int notrace patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; @@ -988,9 +1043,8 @@ static int notrace patch_cmp(const void *key, const void *elt) return 1; return 0; } -NOKPROBE_SYMBOL(patch_cmp); -int notrace poke_int3_handler(struct pt_regs *regs) +int noinstr poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; @@ -1023,9 +1077,9 @@ int notrace poke_int3_handler(struct pt_regs *regs) * Skip the binary search if there is a single member in the vector. 
*/ if (unlikely(desc->nr_entries > 1)) { - tp = bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), - patch_cmp); + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); if (!tp) goto out_put; } else { @@ -1064,7 +1118,6 @@ out_put: put_desc(desc); return ret; } -NOKPROBE_SYMBOL(poke_int3_handler); #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 16133819415c..17cb5b933dcf 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -33,7 +33,6 @@ #include <linux/atomic.h> #include <linux/dma-direct.h> #include <asm/mtrr.h> -#include <asm/pgtable.h> #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -159,7 +158,7 @@ static void dump_leak(void) return; dump = 1; - show_stack(NULL, NULL); + show_stack(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index fe698f96617c..263eeaddb0aa 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -345,56 +345,3 @@ out_noapbt: apb_timer_block_enabled = 0; panic("failed to enable APB timer\n"); } - -/* called before apb_timer_enable, use early map */ -unsigned long apbt_quick_calibrate(void) -{ - int i, scale; - u64 old, new; - u64 t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - apbt_set_mapping(); - dw_apb_clocksource_start(clocksource_apbt); - - /* check if the timer can count down, otherwise return */ - old = dw_apb_clocksource_read(clocksource_apbt); - i = 10000; - while (--i) { - if (old != dw_apb_clocksource_read(clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq / 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - dw_apb_clocksource_start(clocksource_apbt); - - old = dw_apb_clocksource_read(clocksource_apbt); - old += loop; - - t1 = rdtsc(); - - do { - new = dw_apb_clocksource_read(clocksource_apbt); - } while (new < old); - - t2 = rdtsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * (apbt_freq / 1000)) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; -failed: - return 0; -} diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 81b9c63dae1b..9244377ed454 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -352,8 +352,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * According to Intel, MFENCE can do the serialization here. 
*/ asm volatile("mfence" : : : "memory"); - - printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); return; } @@ -546,46 +544,20 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -static u32 hsx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x3a; /* EP */ - case 0x04: return 0x0f; /* EX */ - } - - return ~0U; -} - -static u32 bdx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x00000011; - case 0x03: return 0x0700000e; - case 0x04: return 0x0f00000c; - case 0x05: return 0x0e000003; - } - - return ~0U; -} +static const struct x86_cpu_id deadline_match[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ -static u32 skx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x03: return 0x01000136; - case 0x04: return 0x02000014; - } + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), - if (boot_cpu_data.x86_stepping > 4) - return 0; + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), - return ~0U; -} - -static const struct x86_cpu_id deadline_match[] = { - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_X, &skx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), @@ -603,34 +575,29 @@ static const struct x86_cpu_id deadline_match[] = { {}, }; -static void apic_check_deadline_errata(void) +static __init bool apic_validate_deadline_timer(void) { const struct x86_cpu_id *m; u32 rev; - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || - boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; m = x86_match_cpu(deadline_match); if (!m) - return; + return true; - /* - * Function pointers will have the MSB set due to address layout, - * immediate revisions will not. - */ - if ((long)m->driver_data < 0) - rev = ((u32 (*)(void))(m->driver_data))(); - else - rev = (u32)m->driver_data; + rev = (u32)m->driver_data; if (boot_cpu_data.microcode >= rev) - return; + return true; setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " "please update microcode to version: 0x%x (or later)\n", rev); + return false; } /* @@ -1121,23 +1088,14 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... 
] */ -__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); + ack_APIC_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } @@ -2092,7 +2050,8 @@ void __init init_apic_mappings(void) { unsigned int new_apicid; - apic_check_deadline_errata(); + if (apic_validate_deadline_timer()) + pr_debug("TSC deadline timer available\n"); if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); @@ -2152,15 +2111,21 @@ void __init register_lapic_address(unsigned long address) * Local APIC interrupts */ -/* - * This interrupt should _never_ happen with our APIC/SMP architecture +/** + * spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @vector: The vector number + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. + * + * Also called from sysvec_spurious_apic_interrupt(). */ -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_IRQ(spurious_interrupt) { - u8 vector = ~regs->orig_ax; u32 v; - entering_irq(); trace_spurious_apic_entry(vector); inc_irq_stat(irq_spurious_count); @@ -2190,13 +2155,17 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) } out: trace_spurious_apic_exit(vector); - exiting_irq(); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) +{ + __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); } /* * This interrupt should never happen with our APIC/SMP architecture */ -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) { static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ @@ -2210,7 +2179,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) }; u32 v, i = 0; - entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. 
-- REW */ @@ -2234,7 +2202,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); trace_error_apic_exit(ERROR_APIC_VECTOR); - exiting_irq(); } /** diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index cdf45b4700f2..35edd57f064a 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -12,11 +12,11 @@ */ #include <linux/types.h> #include <linux/init.h> +#include <linux/pgtable.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> -#include <asm/pgtable.h> #include "local.h" diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 913c88617848..ce61e3e7d399 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -154,19 +154,6 @@ static inline bool mp_is_legacy_irq(int irq) return irq >= 0 && irq < nr_legacy_irqs(); } -/* - * Initialize all legacy IRQs and all pins on the first IOAPIC - * if we have legacy interrupt controller. Kernel boot option "pirq=" - * may rely on non-legacy pins on the first IOAPIC. - */ -static inline int mp_init_irq_at_boot(int ioapic, int irq) -{ - if (!nr_legacy_irqs()) - return 0; - - return ioapic == 0 || mp_is_legacy_irq(irq); -} - static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) { return ioapics[ioapic].irqdomain; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 159bd0cb8548..5cbaca58af95 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -115,7 +115,8 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * denote it as spurious which is no harm as this is a rare event * and interrupt handlers have to cope with spurious interrupts * anyway. If the vector is unused, then it is marked so it won't - * trigger the 'No irq handler for vector' warning in do_IRQ(). + * trigger the 'No irq handler for vector' warning in + * common_interrupt(). * * This requires to hold vector lock to prevent concurrent updates to * the affected vector. 
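The deadline_match conversion above replaces the per-model revision callbacks (hsx_deadline_rev() and friends) with stepping-qualified table entries, so apic_validate_deadline_timer() reduces to a table lookup plus a single microcode comparison; the matching loop that honours the new steppings field is in the arch/x86/kernel/cpu/match.c hunk further down. A minimal standalone C sketch of the stepping-bitmask idea, assuming X86_STEPPINGS(min, max) expands to a GENMASK-style run of stepping bits that x86_match_cpu() tests with BIT(c->x86_stepping); MODEL_STEPPINGS and the entry values below are illustrative stand-ins, not the kernel macros:

#include <stdio.h>

/* Model of X86_STEPPINGS(min, max): a contiguous run of stepping bits. */
#define MODEL_STEPPINGS(min, max)  (((1u << ((max) - (min) + 1)) - 1) << (min))
#define MODEL_STEPPING_BIT(s)      (1u << (s))

int main(void)
{
        /* Mirrors the HASWELL_X "stepping 0x2 needs microcode 0x3a" entry. */
        unsigned int steppings = MODEL_STEPPINGS(0x2, 0x2);
        unsigned int cpu_stepping = 0x2;
        unsigned int needed_rev = 0x3a;

        if (MODEL_STEPPING_BIT(cpu_stepping) & steppings)
                printf("entry matches; TSC deadline timer wants microcode >= %#x\n",
                       needed_rev);
        return 0;
}

The same mechanism also encodes the old "steppings above 4 are fine" special case: the SKYLAKE_X entry covering X86_STEPPINGS(0x5, 0xf) carries revision 0, which every microcode version satisfies.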
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 67768e54438b..c48be6e1f676 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -861,13 +861,13 @@ static void free_moved_vector(struct apic_chip_data *apicd) apicd->move_in_progress = 0; } -asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) { struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); struct apic_chip_data *apicd; struct hlist_node *tmp; - entering_ack_irq(); + ack_APIC_irq(); /* Prevent vectors vanishing under us */ raw_spin_lock(&vector_lock); @@ -892,7 +892,6 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) } raw_spin_unlock(&vector_lock); - exiting_irq(); } static void __send_cleanup_vector(struct apic_chip_data *apicd) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ad53b2abc859..69e70ed0f5e6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -30,8 +30,6 @@ static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; -static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; -static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; /* Unpack OEM/TABLE ID's to be NULL terminated strings */ @@ -48,11 +46,9 @@ static struct { unsigned int gnode_shift; } uv_cpuid; -int uv_min_hub_revision_id; -EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static int uv_min_hub_revision_id; unsigned int uv_apicid_hibits; -EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; @@ -85,20 +81,7 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) static inline bool is_GRU_range(u64 start, u64 end) { - if (gru_dist_base) { - u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */ - u64 sl = start & gru_dist_lmask; /* Base offset bits */ - u64 eu = end & gru_dist_umask; - u64 el = end & gru_dist_lmask; - - /* Must reside completely within a single GRU range: */ - return (sl == gru_dist_base && el == gru_dist_base && - su >= gru_first_node_paddr && - su <= gru_last_node_paddr && - eu == su); - } else { - return start >= gru_start_paddr && end <= gru_end_paddr; - } + return start >= gru_start_paddr && end <= gru_end_paddr; } static bool uv_is_untracked_pat_range(u64 start, u64 end) @@ -385,11 +368,10 @@ int is_uv_hubbed(int uvtype) } EXPORT_SYMBOL_GPL(is_uv_hubbed); -int is_uv_hubless(int uvtype) +static int is_uv_hubless(int uvtype) { return (uv_hubless_system & uvtype); } -EXPORT_SYMBOL_GPL(is_uv_hubless); void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); @@ -417,12 +399,6 @@ static __initdata struct uv_gam_range_s *_gr_table; #define SOCK_EMPTY ((unsigned short)~0) -extern int uv_hub_info_version(void) -{ - return UV_HUB_INFO_VERSION; -} -EXPORT_SYMBOL(uv_hub_info_version); - /* Default UV memory block size is 2GB */ static unsigned long mem_block_size __initdata = (2UL << 30); @@ -590,12 +566,21 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) static void uv_send_IPI_one(int cpu, int vector) { - unsigned long apicid; - int pnode; + unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu); + int pnode = uv_apicid_to_pnode(apicid); + unsigned long dmode, val; + + if (vector == NMI_VECTOR) + dmode = dest_NMI; + else + dmode = dest_Fixed; + + val = (1UL << 
UVH_IPI_INT_SEND_SHFT) | + ((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); - apicid = per_cpu(x86_cpu_to_apicid, cpu); - pnode = uv_apicid_to_pnode(apicid); - uv_hub_send_ipi(pnode, apicid, vector); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } static void uv_send_IPI_mask(const struct cpumask *mask, int vector) @@ -797,42 +782,6 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift init_extra_mapping_wb(paddr, bytes); } -static __init void map_gru_distributed(unsigned long c) -{ - union uvh_rh_gam_gru_overlay_config_mmr_u gru; - u64 paddr; - unsigned long bytes; - int nid; - - gru.v = c; - - /* Only base bits 42:28 relevant in dist mode */ - gru_dist_base = gru.v & 0x000007fff0000000UL; - if (!gru_dist_base) { - pr_info("UV: Map GRU_DIST base address NULL\n"); - return; - } - - bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; - gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); - gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); - gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ - - for_each_online_node(nid) { - paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | - gru_dist_base; - init_extra_mapping_wb(paddr, bytes); - gru_first_node_paddr = min(paddr, gru_first_node_paddr); - gru_last_node_paddr = max(paddr, gru_last_node_paddr); - } - - /* Save upper (63:M) bits of address only for is_GRU_range */ - gru_first_node_paddr &= gru_dist_umask; - gru_last_node_paddr &= gru_dist_umask; - - pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); -} - static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; @@ -846,12 +795,6 @@ static __init void map_gru_high(int max_pnode) return; } - /* Only UV3 has distributed GRU mode */ - if (is_uv3_hub() && gru.s3.mode) { - map_gru_distributed(gru.v); - return; - } - base = (gru.v & mask) >> shift; map_high("GRU", base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)base << shift); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index c2a47016f243..828be792231e 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -57,9 +57,6 @@ int main(void) BLANK(); #undef ENTRY - OFFSET(TSS_ist, tss_struct, x86_tss.ist); - DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - - offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index e1efe44ebefc..83d9cad4e68b 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/audit.h> #include <asm/unistd.h> +#include <asm/audit.h> static unsigned dir_class[] = { #include <asm-generic/audit_dir_write.h> @@ -41,7 +42,6 @@ int audit_classify_arch(int arch) int audit_classify_syscall(int abi, unsigned syscall) { #ifdef CONFIG_IA32_EMULATION - extern int ia32_classify_syscall(unsigned); if (abi == AUDIT_ARCH_I386) return ia32_classify_syscall(syscall); #endif diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 676022e71791..1da9b1c9a2db 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -10,10 +10,10 @@ */ #include <linux/interrupt.h> -#include <asm/acrn.h> #include <asm/apic.h> #include <asm/desc.h> #include 
<asm/hypervisor.h> +#include <asm/idtentry.h> #include <asm/irq_regs.h> static uint32_t __init acrn_detect(void) @@ -24,7 +24,7 @@ static uint32_t __init acrn_detect(void) static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); } static bool acrn_x2apic_available(void) @@ -39,7 +39,7 @@ static bool acrn_x2apic_available(void) static void (*acrn_intr_handler)(void); -__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -50,13 +50,12 @@ __visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. */ - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) acrn_intr_handler(); - exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 547ad7bbf0e0..d4806eac9325 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -18,6 +18,7 @@ #include <asm/pci-direct.h> #include <asm/delay.h> #include <asm/debugreg.h> +#include <asm/resctrl.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -597,6 +598,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; } } + + resctrl_cpu_detect(c); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -1142,8 +1145,7 @@ static const int amd_erratum_383[] = /* #1054: Instructions Retired Performance Counter May Be Inaccurate */ static const int amd_erratum_1054[] = - AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..b6f887be440c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -15,6 +15,7 @@ #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched/smt.h> +#include <linux/pgtable.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> @@ -26,7 +27,6 @@ #include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/alternative.h> -#include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> @@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); +static void __init srbds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. 
*/ u64 x86_spec_ctrl_base; @@ -108,6 +109,7 @@ void __init check_bugs(void) l1tf_select_mitigation(); mds_select_mitigation(); taa_select_mitigation(); + srbds_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -398,6 +400,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting + * TSX that are only exposed to SRBDS when TSX is enabled. 
+ */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -1528,6 +1621,11 @@ static char *ibpb_state(void) return ""; } +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1572,6 +1670,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + default: break; } @@ -1618,4 +1719,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..043d93cdcaad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,6 +21,7 @@ #include <linux/smp.h> #include <linux/io.h> #include <linux/syscore_ops.h> +#include <linux/pgtable.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> @@ -35,7 +36,6 @@ #include <asm/vsyscall.h> #include <linux/topology.h> #include <linux/cpumask.h> -#include <asm/pgtable.h> #include <linux/atomic.h> #include <asm/proto.h> #include <asm/setup.h> @@ -387,7 +387,30 @@ set_register: bits_missing); } } -EXPORT_SYMBOL(native_write_cr4); +#if IS_MODULE(CONFIG_LKDTM) +EXPORT_SYMBOL_GPL(native_write_cr4); +#endif + +void cr4_update_irqsoff(unsigned long set, unsigned long clear) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + lockdep_assert_irqs_disabled(); + + newval = (cr4 & ~clear) | set; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} +EXPORT_SYMBOL(cr4_update_irqsoff); + +/* Read the CR4 shadow. 
*/ +unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} +EXPORT_SYMBOL_GPL(cr4_read_shadow); void cr4_init(void) { @@ -854,30 +877,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -945,7 +944,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. @@ -1075,9 +1073,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; -static bool __init cpu_matches(unsigned long which) +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define SRBDS BIT(0) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + {} +}; + +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { - const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + const struct x86_cpu_id *m = x86_match_cpu(table); return m && !!(m->driver_data & which); } @@ -1097,31 +1116,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) u64 ia32_cap = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ - if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); - if (cpu_matches(NO_SPECULATION)) + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & 
ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); - if (cpu_matches(MSBDS_ONLY)) + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } - if (!cpu_matches(NO_SWAPGS)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); /* @@ -1139,7 +1161,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); - if (cpu_matches(NO_MELTDOWN)) + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1148,7 +1179,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (cpu_matches(NO_L1TF)) + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); @@ -1377,20 +1408,6 @@ static void generic_identify(struct cpuinfo_x86 *c) #endif } -static void x86_init_cache_qos(struct cpuinfo_x86 *c) -{ - /* - * The heavy lifting of max_rmid and cache_occ_scale are handled - * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu - * in case CQM bits really aren't there in this CPU. - */ - if (c != &boot_cpu_data) { - boot_cpu_data.x86_cache_max_rmid = - min(boot_cpu_data.x86_cache_max_rmid, - c->x86_cache_max_rmid); - } -} - /* * Validate that ACPI/mptables have the same information about the * effective APIC id and update the package map. @@ -1503,7 +1520,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif x86_init_rdrand(c); - x86_init_cache_qos(c); setup_pku(c); /* @@ -1591,6 +1607,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); + update_srbds_msr(); } static __init int setup_noclflush(char *arg) @@ -1689,25 +1706,6 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -DEFINE_PER_CPU(int, debug_stack_usage); -DEFINE_PER_CPU(u32, debug_idt_ctr); - -void debug_stack_set_zero(void) -{ - this_cpu_inc(debug_idt_ctr); - load_current_idt(); -} -NOKPROBE_SYMBOL(debug_stack_set_zero); - -void debug_stack_reset(void) -{ - if (WARN_ON(!this_cpu_read(debug_idt_ctr))) - return; - if (this_cpu_dec_return(debug_idt_ctr) == 0) - load_current_idt(); -} -NOKPROBE_SYMBOL(debug_stack_reset); - #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 37fdefd14f28..fb538fccd24c 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index bf08d4508ecb..63926c94eb5f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> +#include <linux/pgtable.h> #include <linux/string.h> #include <linux/bitops.h> @@ -11,7 +12,6 @@ #include <linux/uaccess.h> #include <asm/cpufeature.h> -#include <asm/pgtable.h> #include <asm/msr.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ 
-22,6 +22,7 @@ #include <asm/cpu_device_id.h> #include <asm/cmdline.h> #include <asm/traps.h> +#include <asm/resctrl.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -322,6 +323,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) detect_ht_early(c); } +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + #ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 @@ -961,6 +967,7 @@ static const struct cpu_dev intel_cpu_dev = { #endif .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, }; @@ -1119,35 +1126,53 @@ void switch_to_sld(unsigned long tifn) sld_update_msr(!(tifn & _TIF_SLD)); } -#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY} - /* - * The following processors have the split lock detection feature. But - * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot - * be enumerated. Enable it by family and model matching on these - * processors. + * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should + * only be trusted if it is confirmed that a CPU model implements a + * specific feature at a particular bit position. + * + * The possible driver data field values: + * + * - 0: CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + * + * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use + * bit 5 to enumerate the per-core split-lock detection feature. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X), - SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), {} }; void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) { - u64 ia32_core_caps = 0; + const struct x86_cpu_id *m; + u64 ia32_core_caps; - if (c->x86_vendor != X86_VENDOR_INTEL) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; - if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) { - /* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */ + + m = x86_match_cpu(split_lock_cpu_ids); + if (!m) + return; + + switch (m->driver_data) { + case 0: + break; + case 1: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - } else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { - /* Enumerate split lock detection by family and model. 
*/ - if (x86_match_cpu(split_lock_cpu_ids)) - ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT; + if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) + return; + break; + default: + return; } - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - split_lock_setup(); + split_lock_setup(); } diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index d3482eb43ff3..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -39,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index ea3cf714b7ad..99be063fcb1b 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -921,14 +921,13 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 02e1f165f148..30413325de22 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,8 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/task_work.h> +#include <linux/hardirq.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -128,7 +130,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +noinstr void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -138,12 +140,12 @@ void mce_setup(struct mce *m) m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; - rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) - rdmsrl(MSR_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_PPIN); else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) - rdmsrl(MSR_AMD_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_AMD_PPIN); m->microcode = boot_cpu_data.microcode; } @@ -1057,23 +1059,6 @@ static void mce_clear_state(unsigned long *toclear) } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not 
recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline. @@ -1086,13 +1071,15 @@ static int do_memory_failure(struct mce *m) * kdump kernel establishing a new #MC handler where a broadcasted MCE * might not get handled properly. */ -static bool __mc_check_crashing_cpu(int cpu) +static noinstr bool mce_check_crashing_cpu(void) { + unsigned int cpu = smp_processor_id(); + if (cpu_is_offline(cpu) || (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1100,7 +1087,7 @@ static bool __mc_check_crashing_cpu(int cpu) } if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); return true; } } @@ -1175,6 +1162,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, *m = *final; } +static void kill_me_now(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + if (!(p->mce_status & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + return; + } + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1193,12 +1203,11 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. */ -void notrace do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; - int cpu = smp_processor_id(); struct mce m, *final; char *msg = NULL; int worst = 0; @@ -1227,11 +1236,6 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) */ int lmce = 1; - if (__mc_check_crashing_cpu(cpu)) - return; - - ist_enter(regs); - this_cpu_inc(mce_exception_count); mce_gather_info(&m, regs); @@ -1319,17 +1323,19 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) - goto out_ist; + return; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); - local_irq_enable(); - - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - local_irq_disable(); - ist_end_non_atomic(); + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); + + current->mce_addr = m.addr; + current->mce_status = m.mcgstatus; + current->mce_kill_me.func = kill_me_maybe; + if (kill_it) + current->mce_kill_me.func = kill_me_now; + task_work_add(current, ¤t->mce_kill_me, true); } else { /* * Handle an MCE which has happened in kernel space but from @@ -1341,16 +1347,12 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) * proper one. 
*/ if (m.kflags & MCE_IN_KERNEL_RECOV) { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) + if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } } - -out_ist: - ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); -NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) @@ -1876,21 +1878,84 @@ bool filter_mce(struct mce *m) } /* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) +static noinstr void unexpected_machine_check(struct pt_regs *regs) { + instrumentation_begin(); pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); + instrumentation_end(); } /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; + +static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) +{ + /* + * Only required when from kernel mode. See + * mce_check_crashing_cpu() for details. + */ + if (machine_check_vector == do_machine_check && + mce_check_crashing_cpu()) + return; + + nmi_enter(); + /* + * The call targets are marked noinstr, but objtool can't figure + * that out because it's an indirect call. Annotate it. + */ + instrumentation_begin(); + trace_hardirqs_off_finish(); + machine_check_vector(regs); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); + nmi_exit(); +} + +static __always_inline void exc_machine_check_user(struct pt_regs *regs) +{ + idtentry_enter_user(regs); + instrumentation_begin(); + machine_check_vector(regs); + instrumentation_end(); + idtentry_exit_user(regs); +} -dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code) +#ifdef CONFIG_X86_64 +/* MCE hit kernel mode */ +DEFINE_IDTENTRY_MCE(exc_machine_check) { - machine_check_vector(regs, error_code); + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} + +/* The user mode variant. */ +DEFINE_IDTENTRY_MCE_USER(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_user(regs); + local_db_restore(dr7); } -NOKPROBE_SYMBOL(do_mce); +#else +/* 32bit unified entry point */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif /* * Called for each booted CPU to set up machine checks. 
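The mce/core.c hunks above replace the old in-exception do_memory_failure() call with work deferred to the return-to-user path: the #MC handler only records the fault in task_struct and queues current->mce_kill_me, and kill_me_maybe() later calls memory_failure() from process context, where sleeping is allowed. What follows is a minimal sketch of that task_work pattern, not part of this patch; it assumes the v5.7-era task_work_add() signature taking a bool notify argument, and demo_work, demo_callback() and demo_handle_fault() are illustrative names.

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/task_work.h>

static struct callback_head demo_work;

/* Runs in process context on the way back to user space. */
static void demo_callback(struct callback_head *cb)
{
	/* Sleeping calls such as memory_failure() would be legal here. */
	force_sig(SIGBUS);
}

/* Called from exception context, where sleeping is forbidden. */
static void demo_handle_fault(void)
{
	init_task_work(&demo_work, demo_callback);
	/* true: notify the task so the callback runs promptly. */
	task_work_add(current, &demo_work, true);
}

The patch itself embeds the callback_head in task_struct (current->mce_kill_me), so each task carries its own pending-kill state rather than a shared static like demo_work.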
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 3413b41b8d55..0593b192eb8f 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -146,9 +146,9 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
 		regs.cs = m->cs;
 		pregs = &regs;
 	}
-	/* in mcheck exeception handler, irq will be disabled */
+	/* do_machine_check() expects interrupts disabled -- at least don't break it */
 	local_irq_save(flags);
-	do_machine_check(pregs, 0);
+	do_machine_check(pregs);
 	local_irq_restore(flags);
 	m->finished = 0;
 }
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 55f5c7b755f2..6473070b5da4 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -9,7 +9,7 @@
 #include <asm/mce.h>

 /* Pointer to the installed machine check handler for this CPU setup. */
-extern void (*machine_check_vector)(struct pt_regs *, long error_code);
+extern void (*machine_check_vector)(struct pt_regs *);

 enum severity_level {
 	MCE_NO_SEVERITY,
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 4ae6df556526..19e90cae8e97 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/smp.h>
+#include <linux/hardirq.h>

 #include <asm/processor.h>
 #include <asm/traps.h>
@@ -20,12 +21,11 @@ int mce_p5_enabled __read_mostly;

 /* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void pentium_machine_check(struct pt_regs *regs)
 {
 	u32 loaddr, hi, lotype;

-	ist_enter(regs);
-
+	instrumentation_begin();
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);

@@ -38,8 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
 	}

 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
-	ist_exit(regs);
+	instrumentation_end();
 }

 /* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index f36dc0742085..a7cd2d203ced 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -614,14 +614,13 @@ static void unexpected_thermal_interrupt(void)

 static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;

-asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
 {
-	entering_irq();
 	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
 	inc_irq_stat(irq_thermal_count);
 	smp_thermal_vector();
 	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
-	exiting_ack_irq();
+	ack_APIC_irq();
 }

 /* Thermal monitoring depends on APIC, ACPI and clock modulation */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 28812cc15300..6a059a035021 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -21,12 +21,11 @@ static void default_threshold_interrupt(void)

 void (*mce_threshold_vector)(void) = default_threshold_interrupt;

-asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
 {
-	entering_irq();
 	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
 	inc_irq_stat(irq_threshold_count);
 	mce_threshold_vector();
 	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
-	exiting_ack_irq();
+	ack_APIC_irq();
 }
diff --git
a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index a30ea13cccc2..9c9f0abd2d7f 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -16,14 +17,12 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static void winchip_machine_check(struct pt_regs *regs, long error_code) +static noinstr void winchip_machine_check(struct pt_regs *regs) { - ist_enter(regs); - + instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - ist_exit(regs); + instrumentation_end(); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7019d4b2df0c..baec68b7e010 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -545,8 +545,7 @@ static int __wait_for_cpus(atomic_t *t, long long timeout) /* * Returns: * < 0 - on error - * 0 - no update done - * 1 - microcode was updated + * 0 - success (no update done or microcode was updated) */ static int __reload_late(void *info) { @@ -573,11 +572,11 @@ static int __reload_late(void *info) else goto wait_for_siblings; - if (err > UCODE_NFOUND) { - pr_warn("Error reloading microcode on CPU %d\n", cpu); + if (err >= UCODE_NFOUND) { + if (err == UCODE_ERROR) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + ret = -1; - } else if (err == UCODE_UPDATED || err == UCODE_OK) { - ret = 1; } wait_for_siblings: @@ -608,7 +607,7 @@ static int microcode_reload_late(void) atomic_set(&late_cpus_out, 0); ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (ret > 0) + if (ret == 0) microcode_check(); pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); @@ -649,7 +648,7 @@ static ssize_t reload_store(struct device *dev, put: put_online_cpus(); - if (ret >= 0) + if (ret == 0) ret = size; return ret; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index caa032ce3fe3..af94f05a5c66 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -23,6 +23,7 @@ #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/desc.h> +#include <asm/idtentry.h> #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> @@ -40,11 +41,10 @@ static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); -__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); @@ -52,7 +52,6 @@ __visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } @@ -73,19 +72,16 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); * Routines to do per-architecture handling of stimer0 * interrupts when in Direct Mode */ - -__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(hyperv_stimer0_count); if 
(hv_stimer0_handler)
 		hv_stimer0_handler();
 	add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
 	ack_APIC_irq();
-	exiting_irq();
 	set_irq_regs(old_regs);
 }
@@ -227,8 +223,8 @@ static void __init ms_hyperv_init_platform(void)
 	ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
 	ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);

-	pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
-		ms_hyperv.features, ms_hyperv.hints);
+	pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
+		ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);

 	ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
 	ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -263,6 +259,16 @@ static void __init ms_hyperv_init_platform(void)
 			cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
 	}

+	/*
+	 * Hyper-V expects crash register data or kmsg to be delivered
+	 * through its crash enlightenment when the system crashes. Set
+	 * crash_kexec_post_notifiers to true so that the crash
+	 * enlightenment interface is invoked before the kdump kernel
+	 * runs.
+	 */
+	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
+		crash_kexec_post_notifiers = true;
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
 	    ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
@@ -321,17 +327,19 @@ static void __init ms_hyperv_init_platform(void)
 		x86_platform.apic_post_init = hyperv_init;
 		hyperv_setup_mmu_ops();
 		/* Setup the IDT for hypervisor callback */
-		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);

 		/* Setup the IDT for reenlightenment notifications */
-		if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+		if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) {
 			alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
-					hyperv_reenlightenment_vector);
+					asm_sysvec_hyperv_reenlightenment);
+		}

 		/* Setup the IDT for stimer0 */
-		if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+		if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
 			alloc_intr_gate(HYPERV_STIMER0_VECTOR,
-					hv_stimer0_callback_vector);
+					asm_sysvec_hyperv_stimer0);
+		}
 # ifdef CONFIG_SMP
 		smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 51b9190c628b..23ad8e953dfb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -761,7 +761,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)

 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-	__flush_tlb();
+	flush_tlb_local();

 	/* Save MTRR state */
 	rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
@@ -778,7 +778,7 @@ static void post_set(void) __releases(set_atomicity_lock)
 {
 	/* Flush TLBs (no need to flush caches - they are disabled) */
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-	__flush_tlb();
+	flush_tlb_local();

 	/* Intel (P6) standard MTRRs */
 	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9556930cd8c1..a5ee607a3b89 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+		fallthrough;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr -
MSR_ARCH_PERFMON_PERFCTR0; } return 0; } @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; } return 0; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 89049b343c7a..12f967c6b603 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" /* Mutex to protect rdtgroup access. */ @@ -578,6 +578,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) d->id = id; cpumask_set_cpu(cpu, &d->cpu_mask); + rdt_domain_reconfigure_cdp(r); + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { kfree(d); return; @@ -956,6 +958,36 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +/* Runs once on the BSP during boot. */ +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + if (c->x86_vendor == X86_VENDOR_INTEL) + c->x86_cache_mbm_width_offset = eax & 0xff; + else + c->x86_cache_mbm_width_offset = -1; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b531..934c8fb8a64a 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -495,14 +495,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -539,7 +541,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 181c992f448c..f20a47d120b1 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -31,7 +31,7 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -40,6 +40,12 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. 
The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { @@ -87,6 +93,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; int evtid; bool first; @@ -460,6 +467,7 @@ struct rdt_resource { struct list_head evt_list; int num_rmid; unsigned int mon_scale; + unsigned int mbm_width; unsigned long fflags; }; @@ -587,8 +595,9 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); @@ -601,5 +610,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r); +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 773124b0e18a..837d7d012b7b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -214,9 +214,9 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 shift = 64 - MBM_CNTR_WIDTH, chunks; + u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; @@ -256,7 +256,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); + chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); m->chunks += chunks; m->prev_msr = tval; @@ -278,7 +278,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); + chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); m->chunks_bw += chunks; m->chunks = m->chunks_bw; cur_bw = (chunks * r->mon_scale) >> 20; @@ -433,11 +433,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -510,6 +511,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -517,16 +519,18 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + d = get_domain_from_cpu(cpu, r); if (!d) goto out_unlock; list_for_each_entry(prgrp, &rdt_all_groups, 
rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -614,11 +618,18 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + r->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + r->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d..0daf2f1cf7a8 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -24,7 +24,7 @@ #include <asm/cacheflush.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/perf_event.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ @@ -1326,9 +1326,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * pseudo-locked region will still be here on return. * * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_sem semaphore which is obtained in - * the device_create() and debugfs_create_dir() callpath below - * as well as before the mmap() callback is called. + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. */ mutex_unlock(&rdtgroup_mutex); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 064e9ef44cd6..23b4b61319d3 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -29,7 +29,7 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); @@ -1859,6 +1859,19 @@ static int set_cache_qos_cfg(int level, bool enable) return 0; } +/* Restore the qos cfg state when a domain comes online */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r) +{ + if (!r->alloc_capable) + return; + + if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA]) + l2_qos_cfg_update(&r->alloc_enabled); + + if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA]) + l3_qos_cfg_update(&r->alloc_enabled); +} + /* * Enable or disable the MBA software controller * which helps user specify bandwidth in MBps. @@ -2459,7 +2472,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; @@ -3072,7 +3085,8 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) * If the rdtgroup is a mon group and parent directory * is a valid "mon_groups" directory, remove the mon group. 
 */
-	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
+	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
+	    rdtgrp != &rdtgroup_default) {
 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
 			ret = rdtgroup_ctrl_remove(kn, rdtgrp);
@@ -3185,10 +3199,10 @@ int __init rdtgroup_init(void)
  * during the debugfs directory creation also &sb->s_type->i_mutex_key
  * (the lockdep class of inode->i_rwsem). Other filesystem
  * interactions (eg. SyS_getdents) have the lock ordering:
- *   &sb->s_type->i_mutex_key --> &mm->mmap_sem
- * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
+ *   &sb->s_type->i_mutex_key --> &mm->mmap_lock
+ * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
  * is taken, thus creating dependency:
- *   &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
+ *   &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
  * issues considering the other two lock dependencies.
  * By creating the debugfs directory here we avoid a dependency
  * that may cause deadlock (even though file operations cannot
diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c
index c0159a7bca6d..8a89c109e20a 100644
--- a/arch/x86/kernel/crash_core_32.c
+++ b/arch/x86/kernel/crash_core_32.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only

 #include <linux/crash_core.h>
+#include <linux/pgtable.h>

-#include <asm/pgtable.h>
 #include <asm/setup.h>

 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c
index 845a57eb4eb7..7d255f882afe 100644
--- a/arch/x86/kernel/crash_core_64.c
+++ b/arch/x86/kernel/crash_core_64.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only

 #include <linux/crash_core.h>
+#include <linux/pgtable.h>

-#include <asm/pgtable.h>
 #include <asm/setup.h>

 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 3793646f0fb5..759d392cbe9f 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -6,12 +6,10 @@
 #include <linux/fs.h>
 #include <linux/uaccess.h>

-#include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
 #include <asm/traps.h>

-extern void double_fault(void);
 #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)

 #define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
@@ -22,7 +20,7 @@ static void set_df_gdt_entry(unsigned int cpu);
 * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
 * we're running the doublefault task. Cannot return.
 */
-asmlinkage notrace void __noreturn doublefault_shim(void)
+asmlinkage noinstr void __noreturn doublefault_shim(void)
 {
 	unsigned long cr2;
 	struct pt_regs regs;
@@ -41,7 +39,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
 	 * Fill in pt_regs. A downside of doing this in C is that the unwinder
 	 * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
 	 * won't successfully unwind to the source of the double fault.
-	 * The main dump from do_double_fault() is fine, though, since it
+	 * The main dump from exc_double_fault() is fine, though, since it
 	 * uses these regs directly.
 	 *
 	 * If anyone ever cares, this could be moved to asm.
@@ -71,7 +69,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
 	regs.cx = TSS(cx);
 	regs.bx = TSS(bx);

-	do_double_fault(&regs, 0, cr2);
+	exc_double_fault(&regs, 0, cr2);

 	/*
 	 * x86_32 does not save the original CR3 anywhere on a task switch.
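Looking back at the resctrl monitor.c hunks earlier in this diff: mbm_overflow_count() now takes the counter width as a parameter instead of hard-coding MBM_CNTR_WIDTH, since the width can exceed 24 bits on newer parts. Below is a self-contained sketch of that wraparound arithmetic, not part of the patch; counter_delta() is an illustrative name.

#include <linux/types.h>

/*
 * Delta between two samples of a hardware counter that is only
 * 'width' bits wide. Shifting both samples left so the counter's
 * top bit lands in bit 63 makes the unsigned subtraction wrap
 * correctly even if the counter overflowed between the reads.
 */
static u64 counter_delta(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width;

	return ((cur_msr << shift) - (prev_msr << shift)) >> shift;
}

With width computed as MBM_CNTR_WIDTH_BASE plus the CPUID-enumerated offset, as rdt_get_mon_l3_config() now does, the accumulated chunk count stays monotonic across counter wrap.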
@@ -85,7 +83,6 @@ asmlinkage notrace void __noreturn doublefault_shim(void) */ panic("cannot return from double fault\n"); } -NOKPROBE_SYMBOL(doublefault_shim); DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .tss = { @@ -96,7 +93,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) double_fault, + .ip = (unsigned long) asm_exc_double_fault, .flags = X86_EFLAGS_FIXED, .es = __USER_DS, .cs = __KERNEL_CS, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae64ec7f752f..456511b2284e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -65,7 +65,7 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info) } static void printk_stack_address(unsigned long address, int reliable, - char *log_lvl) + const char *log_lvl) { touch_nmi_watchdog(); printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); @@ -160,7 +160,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) + unsigned long *stack, const char *log_lvl) { struct unwind_state state; struct stack_info stack_info = {0}; @@ -279,7 +279,8 @@ next: } } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl) { task = task ? : current; @@ -290,7 +291,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, loglvl); } void show_stack_regs(struct pt_regs *regs) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 8e3a8fedfa4d..722fd712e1cf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,6 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); struct doublefault_stack *ss = &cea->doublefault_stack; @@ -103,9 +102,6 @@ static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); return true; -#else - return false; -#endif } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 87b97897a881..4a94d38cd141 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -22,15 +22,13 @@ static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", - [ ESTACK_DB2 ] = "#DB2", - [ ESTACK_DB1 ] = "#DB1", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -79,7 +77,6 @@ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), - EPAGERANGE(DB1), EPAGERANGE(DB), EPAGERANGE(MCE), }; @@ -91,7 +88,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) struct pt_regs *regs; unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + 
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* @@ -183,7 +180,8 @@ recursion_check: */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { - printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + if (task == current) + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c5399e80c59c..4d13c57f370a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -910,14 +910,6 @@ static int __init parse_memmap_one(char *p) return -EINVAL; if (!strncmp(p, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* - * If we are doing a crash dump, we still need to know - * the real memory size before the original memory map is - * reset. - */ - saved_max_pfn = e820__end_of_ram_pfn(); -#endif e820_table->nr_entries = 0; userdef = 1; return 0; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b33904251a9..d3c531d3b244 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -8,6 +8,7 @@ #include <linux/pci_regs.h> #include <linux/pci_ids.h> #include <linux/errno.h> +#include <linux/pgtable.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> @@ -15,12 +16,8 @@ #include <xen/hvc-console.h> #include <asm/pci-direct.h> #include <asm/fixmap.h> -#include <asm/intel-mid.h> -#include <asm/pgtable.h> #include <linux/usb/ehci_def.h> #include <linux/usb/xhci-dbgp.h> -#include <linux/efi.h> -#include <asm/efi.h> #include <asm/pci_x86.h> /* Simple VGA output */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 12e7d4406c32..4fe7af58cfe1 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -29,7 +29,7 @@ #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/random.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/espfix.h> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12c70840980e..06c818967bb6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -291,15 +291,13 @@ void fpu__drop(struct fpu *fpu) } /* - * Clear FPU registers by setting them up from - * the init fpstate: + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(void) +static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { - fpregs_lock(); - if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, -1); + copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else @@ -307,9 +305,6 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); - - fpregs_mark_activate(); - fpregs_unlock(); } /* @@ -318,18 +313,40 @@ static inline void copy_init_fpstate_to_fpregs(void) * Called by sys_execve(), by the signal handler code and by various * error paths. 
 */
-void fpu__clear(struct fpu *fpu)
+static void fpu__clear(struct fpu *fpu, bool user_only)
 {
-	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
+	WARN_ON_FPU(fpu != &current->thread.fpu);

-	fpu__drop(fpu);
+	if (!static_cpu_has(X86_FEATURE_FPU)) {
+		fpu__drop(fpu);
+		fpu__initialize(fpu);
+		return;
+	}

-	/*
-	 * Make sure fpstate is cleared and initialized.
-	 */
-	fpu__initialize(fpu);
-	if (static_cpu_has(X86_FEATURE_FPU))
-		copy_init_fpstate_to_fpregs();
+	fpregs_lock();
+
+	if (user_only) {
+		if (!fpregs_state_valid(fpu, smp_processor_id()) &&
+		    xfeatures_mask_supervisor())
+			copy_kernel_to_xregs(&fpu->state.xsave,
+					     xfeatures_mask_supervisor());
+		copy_init_fpstate_to_fpregs(xfeatures_mask_user());
+	} else {
+		copy_init_fpstate_to_fpregs(xfeatures_mask_all);
+	}
+
+	fpregs_mark_activate();
+	fpregs_unlock();
+}
+
+void fpu__clear_user_states(struct fpu *fpu)
+{
+	fpu__clear(fpu, true);
+}
+
+void fpu__clear_all(struct fpu *fpu)
+{
+	fpu__clear(fpu, false);
 }

 /*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6ce7e0a23268..61ddc3a5e5c2 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -224,7 +224,8 @@ static void __init fpu__init_system_xstate_size_legacy(void)
 */
 u64 __init fpu__get_supported_xfeatures_mask(void)
 {
-	return XCNTXT_MASK;
+	return XFEATURE_MASK_USER_SUPPORTED |
+	       XFEATURE_MASK_SUPERVISOR_SUPPORTED;
 }

 /* Legacy code to initialize eager fpu mode. */
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index d652b939ccfb..bd1d0649f8ce 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -139,7 +139,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 	} else {
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 		if (!ret)
-			ret = validate_xstate_header(&xsave->header);
+			ret = validate_user_xstate_header(&xsave->header);
 	}

 	/*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 400a05e1c1c5..9393a445d73c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -211,9 +211,9 @@ retry:
 }

 static inline void
-sanitize_restored_xstate(union fpregs_state *state,
-			 struct user_i387_ia32_struct *ia32_env,
-			 u64 xfeatures, int fx_only)
+sanitize_restored_user_xstate(union fpregs_state *state,
+			      struct user_i387_ia32_struct *ia32_env,
+			      u64 user_xfeatures, int fx_only)
 {
 	struct xregs_state *xsave = &state->xsave;
 	struct xstate_header *header = &xsave->header;
@@ -226,13 +226,22 @@ sanitize_restored_xstate(union fpregs_state *state,
 	 */

 	/*
-	 * Init the state that is not present in the memory
-	 * layout and not enabled by the OS.
+	 * 'user_xfeatures' might have bits clear which are
+	 * set in header->xfeatures. This represents features that
+	 * were in init state prior to a signal delivery, and need
+	 * to be reset back to the init state. Clear any user
+	 * feature bits which are set in the kernel buffer to get
+	 * them back to the init state.
+	 *
+	 * Supervisor state is unchanged by input from userspace.
+	 * Ensure supervisor state bits stay set and supervisor
+	 * state is not modified.
*/ if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= xfeatures; + header->xfeatures &= user_xfeatures | + xfeatures_mask_supervisor(); } if (use_fxsr()) { @@ -252,16 +261,24 @@ sanitize_restored_xstate(union fpregs_state *state, */ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { + u64 init_bv; + int r; + if (use_xsave()) { if (fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_fxregs(buf); + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + + r = copy_user_to_fxregs(buf); + if (!r) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return r; } else { - u64 init_bv = xfeatures_mask & ~xbv; - if (unlikely(init_bv)) + init_bv = xfeatures_mask_user() & ~xbv; + + r = copy_user_to_xregs(buf, xbv); + if (!r && unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_xregs(buf, xbv); + return r; } } else if (use_fxsr()) { return copy_user_to_fxregs(buf); @@ -277,7 +294,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; - u64 xfeatures = 0; + u64 user_xfeatures = 0; int fx_only = 0; int ret = 0; @@ -285,7 +302,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { - fpu__clear(fpu); + fpu__clear_user_states(fpu); return 0; } @@ -310,32 +327,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; + user_xfeatures = fx_sw_user.xfeatures; } } - /* - * The current state of the FPU registers does not matter. By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered - * to be loaded again on return to userland (overriding last_cpu avoids - * the optimisation). - */ - set_thread_flag(TIF_NEED_FPU_LOAD); - __fpu_invalidate_fpregs_state(fpu); - if ((unsigned long)buf_fx % 64) fx_only = 1; - /* - * For 32-bit frames with fxstate, copy the fxstate so it can be - * reconstructed later. - */ - if (ia32_fxstate) { - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - goto err_out; - envp = &env; - } else { + + if (!ia32_fxstate) { /* * Attempt to restore the FPU registers directly from user * memory. For that to succeed, the user access cannot cause @@ -345,20 +344,65 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpregs_lock(); pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { + + /* + * Restore supervisor states: previous context switch + * etc has done XSAVES and saved the supervisor states + * in the kernel buffer from which they can be restored + * now. + * + * We cannot do a single XRSTORS here - which would + * be nice - because the rest of the FPU registers are + * being restored from a user buffer directly. The + * single XRSTORS happens below, when the user buffer + * has been copied to the kernel one. 
+ */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); fpregs_mark_activate(); fpregs_unlock(); return 0; } - fpregs_deactivate(fpu); fpregs_unlock(); + } else { + /* + * For 32-bit frames with fxstate, copy the fxstate so it can + * be reconstructed later. + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto err_out; + envp = &env; } + /* + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + fpregs_lock(); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + + /* + * Supervisor states are not modified by user space input. Save + * current supervisor states first and invalidate the FPU regs. + */ + if (xfeatures_mask_supervisor()) + copy_supervisor_to_kernel(&fpu->state.xsave); + set_thread_flag(TIF_NEED_FPU_LOAD); + } + __fpu_invalidate_fpregs_state(fpu); + fpregs_unlock(); if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; + u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; if (using_compacted_format()) { ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); @@ -366,17 +410,24 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&fpu->state.xsave.header); + ret = validate_user_xstate_header(&fpu->state.xsave.header); } if (ret) goto err_out; - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); + + /* + * Restore previously saved supervisor xstates along with + * copied-in user xstates. 
+ */ + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, + user_xfeatures | xfeatures_mask_supervisor()); } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); @@ -385,11 +436,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) goto err_out; } - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + u64 init_bv; + + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } @@ -410,7 +464,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) err_out: if (ret) - fpu__clear(fpu); + fpu__clear_user_states(fpu); return ret; } @@ -465,7 +519,7 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xfeatures = xfeatures_mask_user(); fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 32b153d38748..bda2e5eaca0e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -54,13 +54,15 @@ static short xsave_cpuid_features[] __initdata = { }; /* - * Mask of xstate features supported by the CPU and the kernel: + * This represents the full set of bits that should ever be set in a kernel + * XSAVE buffer, both supervisor and user xstates. */ -u64 xfeatures_mask __read_mostly; +u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -76,7 +78,7 @@ unsigned int fpu_user_xstate_size; */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -150,7 +152,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * None of the feature bits are in init state. So nothing else * to do for us, as the memory layout is up to date. */ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) + if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) return; /* @@ -177,7 +179,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * in a special way already: */ feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; /* * Update all the remaining memory layouts according to their @@ -205,30 +207,39 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) + u64 unsup_bits; + + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) return; /* - * Make it clear that XSAVES supervisor states are not yet - * implemented should anyone expect it to work by changing - * bits in XFEATURE_MASK_* macros and XCR0. 
+ * Unsupported supervisor xstates should not be found in + * the xfeatures mask. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), - "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", + unsup_bits); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } -/* - * Note that in the future we will likely need a pair of - * functions here: one for user xstates and the other for - * system xstates. For now, they are the same. - */ -static int xfeature_enabled(enum xfeature xfeature) +static bool xfeature_enabled(enum xfeature xfeature) { - return !!(xfeatures_mask & (1UL << xfeature)); + return xfeatures_mask_all & BIT_ULL(xfeature); } /* @@ -383,6 +394,33 @@ static void __init setup_xstate_comp_offsets(void) } /* + * Setup offsets of a supervisor-state-only XSAVES buffer: + * + * The offsets stored in xstate_comp_offsets[] only work for one specific + * value of the Requested Feature BitMap (RFBM). In cases where a different + * RFBM value is used, a different set of offsets is required. This set of + * offsets is for when RFBM=xfeatures_mask_supervisor(). + */ +static void __init setup_supervisor_only_offsets(void) +{ + unsigned int next_offset; + int i; + + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + continue; + + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); + + xstate_supervisor_only_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; + } +} + +/* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) @@ -415,7 +453,7 @@ static void __init setup_init_fpu_buf(void) if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; + xfeatures_mask_all; /* * Init all the features state with header.xfeatures being 0x0 @@ -438,7 +476,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) * format. Checking a supervisor state's uncompacted offset is * an error. 
*/ - if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) { + if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); return -1; } @@ -472,10 +510,10 @@ int using_compacted_format(void) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_xstate_header(const struct xstate_header *hdr) +int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + if (hdr->xfeatures & ~xfeatures_mask_user()) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -610,15 +648,12 @@ static void do_extra_xstate_size_checks(void) /* - * Get total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. - * - * Note that we do not currently set any bits on IA32_XSS so - * 'XCR0 | IA32_XSS == XCR0' for now. */ static unsigned int __init get_xsaves_size(void) { @@ -700,7 +735,7 @@ static int __init init_xstate_size(void) */ static void fpu__init_disable_system_xstate(void) { - xfeatures_mask = 0; + xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); } @@ -735,16 +770,26 @@ void __init fpu__init_system_xstate(void) return; } + /* + * Find user xstates supported by the processor. + */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); + xfeatures_mask_all = eax + ((u64)edx << 32); - if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + xfeatures_mask_all |= ecx + ((u64)edx << 32); + + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). 
*/ - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + xfeatures_mask_all); goto out_disable; } @@ -753,10 +798,10 @@ void __init fpu__init_system_xstate(void) */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask &= ~BIT(i); + xfeatures_mask_all &= ~BIT_ULL(i); } - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -768,15 +813,16 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); + setup_supervisor_only_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask, + xfeatures_mask_all, fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; @@ -795,7 +841,14 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } /* @@ -840,10 +893,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that xfeatures_mask is - * what we write to the XCR0 register. + * have not enabled. */ - WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -957,18 +1009,31 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) return true; } -/* - * This is similar to user_regset_copyout(), but will not add offset to - * the source data pointer or increment pos, count, kbuf, and ubuf. 
- */ -static inline void -__copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int offset, unsigned int size, unsigned int size_total) +static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count) { - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); + if (*pos < to) { + unsigned size = to - *pos; + + if (size > *count) + size = *count; + memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size); + *kbuf += size; + *pos += size; + *count -= size; + } +} - memcpy(kbuf + offset, data, copy); +static void copy_part(unsigned offset, unsigned size, void *from, + void **kbuf, unsigned *pos, unsigned *count) +{ + fill_gap(offset, kbuf, pos, count); + if (size > *count) + size = *count; + if (size) { + memcpy(*kbuf, from, size); + *kbuf += size; + *pos += size; + *count -= size; } } @@ -981,8 +1046,9 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, */ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { - unsigned int offset, size; struct xstate_header header; + const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); + unsigned count = size_total; int i; /* @@ -996,48 +1062,44 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; - + header.xfeatures &= xfeatures_mask_user(); + + if (header.xfeatures & XFEATURE_MASK_FP) + copy_part(0, off_mxcsr, + &xsave->i387, &kbuf, &offset_start, &count); + if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) + copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE, + &xsave->i387.mxcsr, &kbuf, &offset_start, &count); + if (header.xfeatures & XFEATURE_MASK_FP) + copy_part(offsetof(struct fxregs_state, st_space), 128, + &xsave->i387.st_space, &kbuf, &offset_start, &count); + if (header.xfeatures & XFEATURE_MASK_SSE) + copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256, + &xsave->i387.xmm_space, &kbuf, &offset_start, &count); + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + copy_part(offsetof(struct fxregs_state, sw_reserved), 48, + xstate_fx_sw_bytes, &kbuf, &offset_start, &count); /* * Copy xregs_state->header: */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); + copy_part(offsetof(struct xregs_state, header), sizeof(header), + &header, &kbuf, &offset_start, &count); - __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); - - for (i = 0; i < XFEATURE_MAX; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { /* * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { void *src = __raw_xsave_addr(xsave, i); - offset = xstate_offsets[i]; - size = xstate_sizes[i]; - - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; - - __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); + copy_part(xstate_offsets[i], xstate_sizes[i], + src, &kbuf, &offset_start, &count); } } - - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); - } - - /* - * Fill xsave->i387.sw_reserved value for ptrace frame: - */ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); - - __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); + 
fill_gap(size_total, &kbuf, &offset_start, &count); return 0; } @@ -1080,7 +1142,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= xfeatures_mask_user(); /* * Copy xregs_state->header: @@ -1147,7 +1209,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { @@ -1173,7 +1235,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: @@ -1201,7 +1263,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { @@ -1229,7 +1291,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: @@ -1239,6 +1301,61 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } +/* + * Save only supervisor states to the kernel buffer. This blows away all + * old states, and is intended to be used only in __fpu__restore_sig(), where + * user states are restored from the user buffer. + */ +void copy_supervisor_to_kernel(struct xregs_state *xstate) +{ + struct xstate_header *header; + u64 max_bit, min_bit; + u32 lmask, hmask; + int err, i; + + if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (!xfeatures_mask_supervisor()) + return; + + max_bit = __fls(xfeatures_mask_supervisor()); + min_bit = __ffs(xfeatures_mask_supervisor()); + + lmask = xfeatures_mask_supervisor(); + hmask = xfeatures_mask_supervisor() >> 32; + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + if (WARN_ON_FPU(err)) + return; + + /* + * At this point, the buffer has only supervisor states and must be + * converted back to normal kernel format. + */ + header = &xstate->header; + header->xcomp_bv |= xfeatures_mask_all; + + /* + * This only moves states up in the buffer. Start with + * the last state and move backwards so that states are + * not overwritten until after they are moved. Note: + * memmove() allows overlapping src/dst buffers. 
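/*
 * Illustrative sketch, not part of the patch: the loop that follows moves
 * each supervisor state from its XSAVES-compacted offset up to its normal
 * offset, walking from the highest feature bit down so no record is
 * clobbered before it has been moved. A self-contained model with two
 * packed records and invented offset tables:
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[32] = "AAAABBBB";              /* two packed 4-byte records */
        unsigned src[2]  = { 0, 4 };            /* compacted source offsets */
        unsigned dst[2]  = { 8, 16 };           /* final destination offsets */
        unsigned size[2] = { 4, 4 };

        for (int i = 1; i >= 0; i--)            /* highest record first */
                memmove(buf + dst[i], buf + src[i], size[i]);

        printf("%.4s %.4s\n", buf + dst[0], buf + dst[1]);      /* AAAA BBBB */
        return 0;
}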
+ */ + for (i = max_bit; i >= min_bit; i--) { + u8 *xbuf = (u8 *)xstate; + + if (!((header->xfeatures >> i) & 1)) + continue; + + /* Move xfeature 'i' into its normal location */ + memmove(xbuf + xstate_comp_offsets[i], + xbuf + xstate_supervisor_only_offsets[i], + xstate_sizes[i]); + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 37a0aeaf89e7..c84d28e90a58 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -282,7 +282,8 @@ static inline void tramp_free(void *tramp) { } /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -extern void ftrace_epilogue(void); +extern void ftrace_regs_caller_ret(void); +extern void ftrace_caller_end(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -334,7 +335,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) call_offset = (unsigned long)ftrace_regs_call; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_epilogue; + end_offset = (unsigned long)ftrace_caller_end; op_offset = (unsigned long)ftrace_caller_op_ptr; call_offset = (unsigned long)ftrace_call; } @@ -366,6 +367,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) if (WARN_ON(ret < 0)) goto fail; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller); + ret = probe_kernel_read(ip, (void *)retq, RET_SIZE); + if (WARN_ON(ret < 0)) + goto fail; + } + /* * The address of the ftrace_ops that is used for this trampoline * is stored at the end of the trampoline. This will be used to @@ -407,7 +415,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) set_vm_flush_reset_perms(trampoline); - set_memory_ro((unsigned long)trampoline, npages); + if (likely(system_state != SYSTEM_BOOTING)) + set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: @@ -415,6 +424,32 @@ fail: return 0; } +void set_ftrace_ops_ro(void) +{ + struct ftrace_ops *ops; + unsigned long start_offset; + unsigned long end_offset; + unsigned long npages; + unsigned long size; + + do_for_each_ftrace_op(ops, ftrace_ops_list) { + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + continue; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + } + size = end_offset - start_offset; + size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(size, PAGE_SIZE); + set_memory_ro((unsigned long)ops->trampoline, npages); + } while_for_each_ftrace_op(ops); +} + static unsigned long calc_trampoline_call_offset(bool save_regs) { unsigned long start_offset; diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e8a9f8370112..e405fe1a8bf4 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -189,5 +189,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - JMP_NOSPEC %ecx + JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 369e61faacfe..083a3da7bb73 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -12,7 +12,7 @@ #include 
<asm/frame.h> .code64 - .section .entry.text, "ax" + .section .text, "ax" #ifdef CONFIG_FRAME_POINTER /* Save parent and function stack frames (rip and rbp) */ @@ -23,7 +23,7 @@ #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ -#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) +#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE) /* * gcc -pg option adds a call to 'mcount' in most functions. @@ -77,7 +77,7 @@ /* * We add enough stack to save all regs. */ - subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rcx, RCX(%rsp) movq %rdx, RDX(%rsp) @@ -157,8 +157,12 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) * think twice before adding any new code or changing the * layout here. */ -SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) +SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) + jmp ftrace_epilogue +SYM_FUNC_END(ftrace_caller); + +SYM_FUNC_START(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) jmp ftrace_stub @@ -170,14 +174,12 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) retq -SYM_FUNC_END(ftrace_caller) +SYM_FUNC_END(ftrace_epilogue) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq - UNWIND_HINT_SAVE - /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ @@ -233,10 +235,13 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq ORIG_RAX(%rsp), %rax movq %rax, MCOUNT_REG_SIZE-8(%rsp) - /* If ORIG_RAX is anything but zero, make this a call to that */ + /* + * If ORIG_RAX is anything but zero, make this a call to that. + * See arch_ftrace_set_direct_caller(). + */ movq ORIG_RAX(%rsp), %rax - cmpq $0, %rax - je 1f + testq %rax, %rax + jz 1f /* Swap the flags with orig_rax */ movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -244,20 +249,14 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq %rax, MCOUNT_REG_SIZE(%rsp) restore_mcount_regs 8 + /* Restore flags */ + popfq - jmp 2f +SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL); + UNWIND_HINT_RET_OFFSET + jmp ftrace_epilogue 1: restore_mcount_regs - - -2: - /* - * The stack layout is nondetermistic here, depending on which path was - * taken. This confuses objtool and ORC, rightfully so. For now, - * pretend the stack always looks like the non-direct case. - */ - UNWIND_HINT_RESTORE - /* Restore flags */ popfq @@ -268,7 +267,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * to the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) - jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) @@ -303,7 +301,7 @@ trace: * function tracing is enabled. 
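/*
 * Illustrative sketch, not part of the patch: in the ftrace_regs_caller
 * hunk above, a non-zero ORIG_RAX slot means a direct-call handler was
 * installed (see arch_ftrace_set_direct_caller()) and the trampoline
 * branches to it instead of taking the normal epilogue. A userspace
 * analogue of that "zero means no override" dispatch; all names invented:
 */
#include <stdio.h>

static void direct_handler(void) { puts("direct trampoline"); }

static void (*orig_ax)(void);           /* models the ORIG_RAX slot, 0 = none */

static void ftrace_return_path(void)
{
        if (orig_ax)                    /* testq %rax, %rax; jz 1f */
                orig_ax();
        else
                puts("normal epilogue");
}

int main(void)
{
        ftrace_return_path();           /* normal epilogue */
        orig_ax = direct_handler;       /* a direct caller gets installed */
        ftrace_return_path();           /* direct trampoline */
        return 0;
}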
*/ movq ftrace_trace_function, %r8 - CALL_NOSPEC %r8 + CALL_NOSPEC r8 restore_mcount_regs jmp fgraph_trace @@ -340,6 +338,6 @@ SYM_CODE_START(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - JMP_NOSPEC %rdi + JMP_NOSPEC rdi SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 206a4b6144c2..cbb71c1b574f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -20,13 +20,13 @@ #include <linux/io.h> #include <linux/memblock.h> #include <linux/mem_encrypt.h> +#include <linux/pgtable.h> #include <asm/processor.h> #include <asm/proto.h> #include <asm/smp.h> #include <asm/setup.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/kdebug.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4bbc770af632..16da4ac01597 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -13,8 +13,8 @@ #include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> +#include <linux/pgtable.h> #include <asm/segment.h> -#include <asm/pgtable.h> #include <asm/page.h> #include <asm/msr.h> #include <asm/cache.h> @@ -29,15 +29,16 @@ #ifdef CONFIG_PARAVIRT_XXL #include <asm/asm-offsets.h> #include <asm/paravirt.h> +#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg #else #define INTERRUPT_RETURN iretq +#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg #endif -/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE +/* + * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. - * */ - #define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4d8d53ed02c9..8cdf29ffd95f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,8 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/user.h> +#include <asm/desc.h> +#include <asm/tlbflush.h> /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); @@ -97,6 +99,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp) unsigned long *dr7; int i; + lockdep_assert_irqs_disabled(); + for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); @@ -115,6 +119,12 @@ int arch_install_hw_breakpoint(struct perf_event *bp) dr7 = this_cpu_ptr(&cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); + /* + * Ensure we first write cpu_dr7 before we set the DR7 register. + * This ensures an NMI never sees a zero cpu_dr7 while DR7 is not zero. + */
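/*
 * Illustrative sketch, not part of the patch: the barrier() just below
 * stops the compiler from reordering the cpu_dr7 update past the hardware
 * DR7 write, so an NMI arriving in between always finds the mirror up to
 * date. A compilable model of the publish-then-arm ordering; hw_dr7 merely
 * stands in for the real debug register:
 */
#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

static unsigned long cpu_dr7;           /* software mirror an NMI may read */
static unsigned long hw_dr7;            /* stand-in for the DR7 register */

static void install_breakpoint(unsigned long bits)
{
        cpu_dr7 |= bits;                /* 1) publish the mirror ... */
        barrier();                      /* 2) ... then ... */
        hw_dr7 = cpu_dr7;               /* 3) ... arm the hardware */
}

int main(void)
{
        install_breakpoint(0x1);
        printf("cpu_dr7=%#lx hw_dr7=%#lx\n", cpu_dr7, hw_dr7);
        return 0;
}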
+ barrier(); + set_debugreg(*dr7, 7); if (info->mask) set_dr_addr_mask(info->mask, i); @@ -134,9 +144,11 @@ int arch_install_hw_breakpoint(struct perf_event *bp) void arch_uninstall_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); - unsigned long *dr7; + unsigned long dr7; int i; + lockdep_assert_irqs_disabled(); + for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); @@ -149,12 +161,20 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; - dr7 = this_cpu_ptr(&cpu_dr7); - *dr7 &= ~__encode_dr7(i, info->len, info->type); + dr7 = this_cpu_read(cpu_dr7); + dr7 &= ~__encode_dr7(i, info->len, info->type); - set_debugreg(*dr7, 7); + set_debugreg(dr7, 7); if (info->mask) set_dr_addr_mask(0, i); + + /* + * Ensure the write to cpu_dr7 is after we've set the DR7 register. + * This ensures an NMI never sees a zero cpu_dr7 while DR7 is not zero. + */ + barrier(); + + this_cpu_write(cpu_dr7, dr7); } static int arch_bp_generic_len(int x86_len) @@ -227,10 +247,76 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } +/* + * Checks whether the range [addr, end] overlaps the area [base, base + size). + */ +static inline bool within_area(unsigned long addr, unsigned long end, + unsigned long base, unsigned long size) +{ + return end >= base && addr < (base + size); } + +/* + * Checks whether the range from addr to end, inclusive, overlaps the fixed + * mapped CPU entry area range or other ranges used for CPU entry. + */ +static inline bool within_cpu_entry(unsigned long addr, unsigned long end) +{ + int cpu; + + /* The CPU entry area is always used for CPU entry */ + if (within_area(addr, end, CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_TOTAL_SIZE)) + return true; + + for_each_possible_cpu(cpu) { + /* The original rw GDT is being used after load_direct_gdt() */ + if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), + GDT_SIZE)) + return true; + + /* + * cpu_tss_rw is not directly referenced by hardware, but + * cpu_tss_rw is also used in CPU entry code. + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct))) + return true; + + /* + * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. + * A data breakpoint on it would cause an unwanted #DB. + * Protect the full cpu_tlbstate structure to be sure. + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tlbstate, cpu), + sizeof(struct tlb_state))) + return true; + } + + return false; +} + static int arch_build_bp_info(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { + unsigned long bp_end; + + bp_end = attr->bp_addr + attr->bp_len - 1; + if (bp_end < attr->bp_addr) + return -EINVAL; + + /* + * Prevent any breakpoint of any type that overlaps the CPU + * entry area and data. This protects the IST stacks and also + * reduces the chance that we ever find out what happens if + * there's a data breakpoint on the GDT, IDT, or TSS.
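/*
 * Illustrative sketch, not part of the patch: within_area() above compares
 * a closed range [addr, end] against a half-open area [base, base + size);
 * the two overlap iff end >= base && addr < base + size. A standalone
 * check of the corner cases:
 */
#include <assert.h>
#include <stdbool.h>

static bool within_area(unsigned long addr, unsigned long end,
                        unsigned long base, unsigned long size)
{
        return end >= base && addr < (base + size);
}

int main(void)
{
        assert(within_area(10, 19, 15, 10));    /* tail overlaps [15, 25) */
        assert(within_area(20, 24, 15, 10));    /* fully inside */
        assert(!within_area(0, 14, 15, 10));    /* ends just before base */
        assert(!within_area(25, 30, 15, 10));   /* starts at base + size */
        return 0;
}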
+ */ + if (within_cpu_entry(attr->bp_addr, bp_end)) + return -EINVAL; + hw->address = attr->bp_addr; hw->mask = 0; @@ -439,7 +525,7 @@ static int hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct perf_event *bp; - unsigned long dr7, dr6; + unsigned long dr6; unsigned long *dr6_p; /* The DR6 value is pointed by args->err */ @@ -454,9 +540,6 @@ static int hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - get_debugreg(dr7, 7); - /* Disable breakpoints during exception handling */ - set_debugreg(0UL, 7); /* * Assert that local interrupts are disabled * Reset the DRn bits in the virtualized register value. @@ -513,7 +596,6 @@ static int hw_breakpoint_handler(struct die_args *args) (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; - set_debugreg(dr7, 7); put_cpu(); return rc; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 519649ddf100..f3c76252247d 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,11 +15,11 @@ #include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> +#include <linux/pgtable.h> #include <linux/atomic.h> #include <asm/timer.h> #include <asm/hw_irq.h> -#include <asm/pgtable.h> #include <asm/desc.h> #include <asm/apic.h> #include <asm/i8259.h> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 87ef69a72c52..0db21206f2f3 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -4,6 +4,8 @@ */ #include <linux/interrupt.h> +#include <asm/cpu_entry_area.h> +#include <asm/set_memory.h> #include <asm/traps.h> #include <asm/proto.h> #include <asm/desc.h> @@ -51,15 +53,23 @@ struct idt_data { #define TSKG(_vector, _gdt) \ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) +#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc)) + +static bool idt_setup_done __initdata; + /* * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ static const __initconst struct idt_data early_idts[] = { - INTG(X86_TRAP_DB, debug), - SYSG(X86_TRAP_BP, int3), + INTG(X86_TRAP_DB, asm_exc_debug), + SYSG(X86_TRAP_BP, asm_exc_int3), + #ifdef CONFIG_X86_32 - INTG(X86_TRAP_PF, page_fault), + /* + * Not possible on 64-bit. See idt_setup_early_pf() for details. + */ + INTG(X86_TRAP_PF, asm_exc_page_fault), #endif }; @@ -70,33 +80,33 @@ static const __initconst struct idt_data early_idts[] = { * set up TSS. 
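/*
 * Illustrative sketch, not part of the patch: def_idts[] below pairs each
 * vector with its asm_exc_* entry point and is walked once by
 * idt_setup_from_table(). A minimal userspace model of that table-driven
 * initialization; the types and handlers are invented stand-ins:
 */
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct idt_data { int vector; void (*handler)(void); };

static void exc_divide_error(void) { puts("#DE"); }
static void exc_invalid_op(void)   { puts("#UD"); }

static const struct idt_data def_idts[] = {
        { 0, exc_divide_error },
        { 6, exc_invalid_op },
};

static void (*idt_table[256])(void);    /* stand-in for the gate array */

static void idt_setup_from_table(const struct idt_data *t, int size)
{
        for (int i = 0; i < size; i++)
                idt_table[t[i].vector] = t[i].handler;
}

int main(void)
{
        idt_setup_from_table(def_idts, ARRAY_SIZE(def_idts));
        idt_table[6]();                 /* dispatch as the CPU would */
        return 0;
}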
*/ static const __initconst struct idt_data def_idts[] = { - INTG(X86_TRAP_DE, divide_error), - INTG(X86_TRAP_NMI, nmi), - INTG(X86_TRAP_BR, bounds), - INTG(X86_TRAP_UD, invalid_op), - INTG(X86_TRAP_NM, device_not_available), - INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), - INTG(X86_TRAP_TS, invalid_TSS), - INTG(X86_TRAP_NP, segment_not_present), - INTG(X86_TRAP_SS, stack_segment), - INTG(X86_TRAP_GP, general_protection), - INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), - INTG(X86_TRAP_MF, coprocessor_error), - INTG(X86_TRAP_AC, alignment_check), - INTG(X86_TRAP_XF, simd_coprocessor_error), + INTG(X86_TRAP_DE, asm_exc_divide_error), + INTG(X86_TRAP_NMI, asm_exc_nmi), + INTG(X86_TRAP_BR, asm_exc_bounds), + INTG(X86_TRAP_UD, asm_exc_invalid_op), + INTG(X86_TRAP_NM, asm_exc_device_not_available), + INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), + INTG(X86_TRAP_TS, asm_exc_invalid_tss), + INTG(X86_TRAP_NP, asm_exc_segment_not_present), + INTG(X86_TRAP_SS, asm_exc_stack_segment), + INTG(X86_TRAP_GP, asm_exc_general_protection), + INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), + INTG(X86_TRAP_MF, asm_exc_coprocessor_error), + INTG(X86_TRAP_AC, asm_exc_alignment_check), + INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error), #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), #else - INTG(X86_TRAP_DF, double_fault), + INTG(X86_TRAP_DF, asm_exc_double_fault), #endif - INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_DB, asm_exc_debug), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, &machine_check), + INTG(X86_TRAP_MC, asm_exc_machine_check), #endif - SYSG(X86_TRAP_OF, overflow), + SYSG(X86_TRAP_OF, asm_exc_overflow), #if defined(CONFIG_IA32_EMULATION) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), #elif defined(CONFIG_X86_32) @@ -109,95 +119,63 @@ static const __initconst struct idt_data def_idts[] = { */ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP - INTG(RESCHEDULE_VECTOR, reschedule_interrupt), - INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), - INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), - INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), - INTG(REBOOT_VECTOR, reboot_interrupt), + INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), + INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), + INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), + INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), + INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif #ifdef CONFIG_X86_THERMAL_VECTOR - INTG(THERMAL_APIC_VECTOR, thermal_interrupt), + INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal), #endif #ifdef CONFIG_X86_MCE_THRESHOLD - INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), + INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold), #endif #ifdef CONFIG_X86_MCE_AMD - INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), + INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error), #endif #ifdef CONFIG_X86_LOCAL_APIC - INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), - INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), # ifdef CONFIG_HAVE_KVM - INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), - INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), - INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, 
asm_sysvec_kvm_posted_intr_nested_ipi), # endif # ifdef CONFIG_IRQ_WORK - INTG(IRQ_WORK_VECTOR, irq_work_interrupt), + INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif -#ifdef CONFIG_X86_UV - INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), -#endif - INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), - INTG(ERROR_APIC_VECTOR, error_interrupt), +# ifdef CONFIG_X86_UV + INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message), +# endif + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif }; -#ifdef CONFIG_X86_64 -/* - * Early traps running on the DEFAULT_STACK because the other interrupt - * stacks work only after cpu_init(). - */ -static const __initconst struct idt_data early_pf_idts[] = { - INTG(X86_TRAP_PF, page_fault), -}; - -/* - * Override for the debug_idt. Same as the default, but with interrupt - * stack set to DEFAULT_STACK (0). Required for NMI trap handling. - */ -static const __initconst struct idt_data dbg_idts[] = { - INTG(X86_TRAP_DB, debug), -}; -#endif - -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; +/* Must be page-aligned because the real IDT is used in the cpu entry area */ +static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; struct desc_ptr idt_descr __ro_after_init = { - .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, + .size = IDT_TABLE_SIZE - 1, .address = (unsigned long) idt_table, }; -#ifdef CONFIG_X86_64 -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; - -/* - * The exceptions which use Interrupt stacks. They are setup after - * cpu_init() when the TSS has been initialized. - */ -static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), -#ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), -#endif -}; +void load_current_idt(void) +{ + lockdep_assert_irqs_disabled(); + load_idt(&idt_descr); +} -/* - * Override for the debug_idt. Same as the default, but with interrupt - * stack set to DEFAULT_STACK (0). Required for NMI trap handling. - */ -const struct desc_ptr debug_idt_descr = { - .size = IDT_ENTRIES * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; +#ifdef CONFIG_X86_F00F_BUG +bool idt_is_f00f_address(unsigned long address) +{ + return ((address - idt_descr.address) >> 3) == 6; +} #endif static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) @@ -214,7 +192,7 @@ static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) #endif } -static void +static __init void idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { gate_desc desc; @@ -227,7 +205,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy } } -static void set_intr_gate(unsigned int n, const void *addr) +static __init void set_intr_gate(unsigned int n, const void *addr) { struct idt_data data; @@ -266,6 +244,27 @@ void __init idt_setup_traps(void) } #ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initconst struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, asm_exc_page_fault), +}; + +/* + * The exceptions which use Interrupt stacks. 
They are set up after + * cpu_init() when the TSS has been initialized. + */ +static const __initconst struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif +}; + /** * idt_setup_early_pf - Initialize the idt table with early pagefault handler * @@ -273,8 +272,10 @@ void __init idt_setup_traps(void) * cpu_init() is invoked and sets up TSS. The IST variant is installed * after that. * - * FIXME: Why is 32bit and 64bit installing the PF handler at different - * places in the early setup code? + * Note that X86_64 cannot install the real #PF handler in + * idt_setup_early_traps() because the memory initialization needs the #PF + * handler from the early_idt_handler_array to initialize the early page + * tables. */ void __init idt_setup_early_pf(void) { @@ -289,17 +290,20 @@ void __init idt_setup_ist_traps(void) { idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); } +#endif -/** - * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps - */ -void __init idt_setup_debugidt_traps(void) +static void __init idt_map_in_cea(void) { - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - - idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); + /* + * Set the IDT descriptor to a fixed read-only location in the cpu + * entry area, so that the "sidt" instruction will not leak the + * location of the kernel, and to defend the IDT against arbitrary + * memory write vulnerabilities. + */ + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; } -#endif /** * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates @@ -318,11 +322,23 @@ void __init idt_setup_apic_and_irq_gates(void) #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { - set_bit(i, system_vectors); + /* + * Don't set the non-assigned system vectors in the + * system_vectors bitmap. Otherwise they show up in + * /proc/interrupts. + */ entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); set_intr_gate(i, entry); } #endif + /* Map IDT into CPU entry area and reload it. 
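/*
 * Illustrative sketch, not part of the patch: idt_map_in_cea() above
 * publishes the IDT through a fixed read-only alias and repoints idt_descr
 * at that alias before the table is reloaded. A loose POSIX analogue that
 * builds a table and then revokes write access; the desc_ptr layout here
 * is simplified:
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

struct desc_ptr { unsigned short size; void *address; };

int main(void)
{
        long pgsz = sysconf(_SC_PAGESIZE);
        void *table = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (table == MAP_FAILED)
                return 1;

        memset(table, 0, pgsz);                 /* build the table ... */
        mprotect(table, pgsz, PROT_READ);       /* ... then make it read-only */

        struct desc_ptr descr = { (unsigned short)(pgsz - 1), table };
        printf("table mapped RO at %p, limit %u\n", descr.address, descr.size);
        return 0;
}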
*/ + idt_map_in_cea(); + load_idt(&idt_descr); + + /* Make the IDT table read only */ + set_memory_ro((unsigned long)&idt_table, 1); + + idt_setup_done = true; } /** @@ -352,16 +368,14 @@ void idt_invalidate(void *addr) load_idt(&idt); } -void __init update_intr_gate(unsigned int n, const void *addr) +void __init alloc_intr_gate(unsigned int n, const void *addr) { - if (WARN_ON_ONCE(!test_bit(n, system_vectors))) + if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) return; - set_intr_gate(n, addr); -} -void alloc_intr_gate(unsigned int n, const void *addr) -{ - BUG_ON(n < FIRST_SYSTEM_VECTOR); - if (!test_and_set_bit(n, system_vectors)) + if (WARN_ON(idt_setup_done)) + return; + + if (!WARN_ON(test_and_set_bit(n, system_vectors))) set_intr_gate(n, addr); } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index a53e7b4a7419..e2fab3ceb09f 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -33,15 +33,15 @@ void io_bitmap_share(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } -static void task_update_io_bitmap(void) +static void task_update_io_bitmap(struct task_struct *tsk) { - struct thread_struct *t = ¤t->thread; + struct thread_struct *t = &tsk->thread; if (t->iopl_emul == 3 || t->io_bitmap) { /* TSS update is handled on exit to user space */ - set_thread_flag(TIF_IO_BITMAP); + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } else { - clear_thread_flag(TIF_IO_BITMAP); + clear_tsk_thread_flag(tsk, TIF_IO_BITMAP); /* Invalidate TSS */ preempt_disable(); tss_update_io_bitmap(); @@ -49,12 +49,12 @@ static void task_update_io_bitmap(void) } } -void io_bitmap_exit(void) +void io_bitmap_exit(struct task_struct *tsk) { - struct io_bitmap *iobm = current->thread.io_bitmap; + struct io_bitmap *iobm = tsk->thread.io_bitmap; - current->thread.io_bitmap = NULL; - task_update_io_bitmap(); + tsk->thread.io_bitmap = NULL; + task_update_io_bitmap(tsk); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -102,7 +102,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if (!iobm) return -ENOMEM; refcount_set(&iobm->refcnt, 1); - io_bitmap_exit(); + io_bitmap_exit(current); } /* @@ -134,7 +134,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) } /* All permissions dropped? 
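/*
 * Illustrative sketch, not part of the patch: io_bitmap_exit() above
 * detaches the bitmap from the task first and frees it only when the last
 * reference goes away. A self-contained model of that detach-then-put
 * pattern using C11 atomics (the kernel uses refcount_dec_and_test()):
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct io_bitmap { atomic_int refcnt; };

static void io_bitmap_exit(struct io_bitmap **slot)
{
        struct io_bitmap *iobm = *slot;

        *slot = NULL;                           /* detach from the task */
        if (iobm && atomic_fetch_sub(&iobm->refcnt, 1) == 1) {
                puts("last reference, freeing");
                free(iobm);
        }
}

int main(void)
{
        struct io_bitmap *bm = malloc(sizeof(*bm));
        struct io_bitmap *parent = bm, *child = bm;

        if (!bm)
                return 1;
        atomic_init(&bm->refcnt, 2);            /* shared after fork() */
        io_bitmap_exit(&parent);                /* 2 -> 1, kept */
        io_bitmap_exit(&child);                 /* 1 -> 0, freed */
        return 0;
}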
*/ if (max_long == UINT_MAX) { - io_bitmap_exit(); + io_bitmap_exit(current); return 0; } @@ -192,7 +192,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) } t->iopl_emul = level; - task_update_io_bitmap(); + task_update_io_bitmap(current); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c7965ff429c5..181060247e3c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -13,12 +13,14 @@ #include <linux/export.h> #include <linux/irq.h> +#include <asm/irq_stack.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/irq.h> #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/desc.h> +#include <asm/traps.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -26,9 +28,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - atomic_t irq_err_count; /* @@ -224,35 +223,35 @@ u64 arch_irq_stat(void) return sum; } +static __always_inline void handle_irq(struct irq_desc *desc, + struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_64)) + run_on_irqstack_cond(desc->handle_irq, desc, regs); + else + __handle_irq(desc, regs); +} /* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). */ -__visible void __irq_entry do_IRQ(struct pt_regs *regs) +DEFINE_IDTENTRY_IRQ(common_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc * desc; - /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_ax; - - entering_irq(); + struct irq_desc *desc; - /* entering_irq() tells RCU that we're not quiescent. Check it. */ + /* entry code tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { - if (IS_ENABLED(CONFIG_X86_32)) - handle_irq(desc, regs); - else - generic_handle_irq_desc(desc); + handle_irq(desc, regs); } else { ack_APIC_irq(); if (desc == VECTOR_UNUSED) { - pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", + pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), vector); } else { @@ -260,8 +259,6 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs) } } - exiting_irq(); - set_irq_regs(old_regs); } @@ -271,17 +268,16 @@ void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ -__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_ack_irq(); + ack_APIC_irq(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } #endif @@ -302,41 +298,29 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); /* * Handler for POSTED_INTERRUPT_VECTOR. 
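/*
 * Illustrative sketch, not part of the patch: common_interrupt() above
 * looks the incoming vector up in a per-CPU table where sentinel values
 * mark unused or shut-down vectors (the kernel checks IS_ERR_OR_NULL()).
 * A compact standalone model of that dispatch; names and sentinels are
 * simplified:
 */
#include <stdio.h>

struct irq_desc { const char *name; };

#define VECTOR_UNUSED   ((struct irq_desc *)0)
#define VECTOR_SHUTDOWN ((struct irq_desc *)-1L)

static struct irq_desc timer = { "timer" };
static struct irq_desc *vector_irq[256] = {
        [32] = &timer,
        [33] = VECTOR_SHUTDOWN,
};

static void common_interrupt(unsigned int vector)
{
        struct irq_desc *desc = vector_irq[vector];

        if (desc == VECTOR_UNUSED || desc == VECTOR_SHUTDOWN)
                printf("no handler for vector %u, just ack\n", vector);
        else
                printf("handling %s irq\n", desc->name);
}

int main(void)
{
        common_interrupt(32);   /* handling timer irq */
        common_interrupt(33);   /* shut down */
        common_interrupt(40);   /* unused */
        return 0;
}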
*/ -__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_ipis); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. */ -__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_NESTED_VECTOR. */ -__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_nested_ipis); - exiting_irq(); - set_irq_regs(old_regs); } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a759ca97cd01..0b79efc87be5 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -148,7 +148,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } -void handle_irq(struct irq_desc *desc, struct pt_regs *regs) +void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { int overflow = check_stack_overflow(); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 12df3a4abfdd..1b4fe93a86c5 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,7 @@ #include <linux/sched/task_stack.h> #include <asm/cpu_entry_area.h> +#include <asm/irq_stack.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -43,7 +44,7 @@ static int map_irq_stack(unsigned int cpu) pages[i] = pfn_to_page(pa >> PAGE_SHIFT); } - va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); if (!va) return -ENOMEM; @@ -70,3 +71,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; return map_irq_stack(cpu); } + +void do_softirq_own_stack(void) +{ + run_on_irqstack_cond(__do_softirq, NULL, NULL); +} diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 80bee7695a20..890d4778cd35 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,18 +9,18 @@ #include <linux/irq_work.h> #include <linux/hardirq.h> #include <asm/apic.h> +#include <asm/idtentry.h> #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> #ifdef CONFIG_X86_LOCAL_APIC -__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); - exiting_irq(); } void arch_irq_work_raise(void) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 5aa523c2d573..dd73135d7cee 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,11 +16,11 @@ #include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> +#include <linux/pgtable.h> #include <linux/atomic.h> #include <asm/timer.h> #include <asm/hw_irq.h> -#include <asm/pgtable.h> #include <asm/desc.h> #include <asm/apic.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 
1cb3ca9bba49..1afbdd1dd777 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4d7022a740ab..3bafe1bd4dc7 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -41,11 +41,11 @@ #include <linux/kasan.h> #include <linux/moduleloader.h> #include <linux/vmalloc.h> +#include <linux/pgtable.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> @@ -1073,13 +1073,6 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); int __init arch_populate_kprobe_blacklist(void) { - int ret; - - ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, - (unsigned long)__irqentry_text_end); - if (ret) - return ret; - return kprobe_add_area_blacklist((unsigned long)__entry_text_start, (unsigned long)__entry_text_end); } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index ea13f6888284..321c19950285 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -16,11 +16,11 @@ #include <linux/kallsyms.h> #include <linux/ftrace.h> #include <linux/frame.h> +#include <linux/pgtable.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> @@ -286,9 +286,7 @@ static int can_optimize(unsigned long paddr) * stack handling and registers setup. */ if (((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) || - ((paddr >= (unsigned long)__irqentry_text_start) && - (paddr < (unsigned long)__irqentry_text_end))) + (paddr < (unsigned long)__entry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6efe0410fb72..d00f7c430e65 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -35,6 +35,8 @@ #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + static int kvmapf = 1; static int __init parse_no_kvmapf(char *arg) @@ -73,7 +75,6 @@ struct kvm_task_sleep_node { struct swait_queue_head wq; u32 token; int cpu; - bool halted; }; static struct kvm_task_sleep_head { @@ -96,77 +97,64 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -/* - * @interrupt_kernel: Is this called from a routine which interrupts the kernel - * (other than user space)? 
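/*
 * Illustrative sketch, not part of the patch: the async-#PF code above
 * hashes the 32-bit token into a small array of sleeper buckets and scans
 * the bucket's list for a matching node, which is how the wait and wake
 * sides find each other. A minimal standalone model; locking is omitted
 * and hash_32() is replaced by a simple multiplicative hash:
 */
#include <stdint.h>
#include <stdio.h>

#define HASHBITS 8
#define HASHSIZE (1 << HASHBITS)

struct node { uint32_t token; struct node *next; };

static struct node *buckets[HASHSIZE];

static unsigned int hash_32(uint32_t v)
{
        return (v * 0x61C88647u) >> (32 - HASHBITS);    /* golden ratio */
}

static void insert(struct node *n)
{
        struct node **b = &buckets[hash_32(n->token)];

        n->next = *b;
        *b = n;
}

static struct node *find(uint32_t token)
{
        for (struct node *n = buckets[hash_32(token)]; n; n = n->next)
                if (n->token == token)
                        return n;
        return NULL;
}

int main(void)
{
        struct node a = { .token = 42 };

        insert(&a);
        printf("token 42 found: %d\n", find(42) != NULL);       /* 1 */
        printf("token  7 found: %d\n", find(7) != NULL);        /* 0 */
        return 0;
}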
- */ -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node n, *e; - DECLARE_SWAITQUEUE(wait); - - rcu_irq_enter(); + struct kvm_task_sleep_node *e; raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); - kfree(e); raw_spin_unlock(&b->lock); - - rcu_irq_exit(); - return; + kfree(e); + return false; } - n.token = token; - n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || - (IS_ENABLED(CONFIG_PREEMPT_COUNT) - ? preempt_count() > 1 || rcu_preempt_depth() - : interrupt_kernel); - init_swait_queue_head(&n.wq); - hlist_add_head(&n.link, &b->list); + n->token = token; + n->cpu = smp_processor_id(); + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. + */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, &n)) + return; for (;;) { - if (!n.halted) - prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; - rcu_irq_exit(); - - if (!n.halted) { - local_irq_enable(); - schedule(); - local_irq_disable(); - } else { - /* - * We cannot reschedule. So halt. 
- */ - native_safe_halt(); - local_irq_disable(); - } - - rcu_irq_enter(); + local_irq_enable(); + schedule(); + local_irq_disable(); } - if (!n.halted) - finish_swait(&n.wq, &wait); - - rcu_irq_exit(); - return; + finish_swait(&n.wq, &wait); } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (n->halted) - smp_send_reschedule(n->cpu); - else if (swq_has_sleeper(&n->wq)) + if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); } @@ -175,12 +163,13 @@ static void apf_task_wake_all(void) int i; for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { - struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); + n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } @@ -221,46 +210,64 @@ again: n->cpu = smp_processor_id(); init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); - } else + } else { apf_task_wake_one(n); + } raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_pf_reason(void) +noinstr u32 kvm_read_and_reset_apf_flags(void) { - u32 reason = 0; + u32 flags = 0; if (__this_cpu_read(apf_reason.enabled)) { - reason = __this_cpu_read(apf_reason.reason); - __this_cpu_write(apf_reason.reason, 0); + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); } - return reason; + return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); -NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); -dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - switch (kvm_read_and_reset_pf_reason()) { - default: - do_page_fault(regs, error_code, address); - break; + u32 reason = kvm_read_and_reset_apf_flags(); + bool rcu_exit; + + switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: - /* page is swapped out by the host. */ - kvm_async_pf_task_wait((u32)address, !user_mode(regs)); - break; case KVM_PV_REASON_PAGE_READY: - rcu_irq_enter(); - kvm_async_pf_task_wake((u32)address); - rcu_irq_exit(); break; + default: + return false; } + + rcu_exit = idtentry_enter_cond_rcu(regs); + instrumentation_begin(); + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (unlikely(!(user_mode(regs)))) + panic("Host injected async #PF in kernel mode\n"); + /* Page is swapped out by the host. 
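/*
 * Illustrative sketch, not part of the patch: kvm_read_and_reset_apf_flags()
 * above consumes the per-CPU flags word exactly once by reading it and
 * writing zero back. The kernel relies on plain per-CPU accesses with
 * interrupts handled by the entry code; this standalone model makes the
 * consume-once property explicit with an atomic exchange instead:
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t apf_flags;

static uint32_t read_and_reset_apf_flags(void)
{
        return atomic_exchange(&apf_flags, 0);
}

int main(void)
{
        atomic_store(&apf_flags, 1);    /* host posted "page not present" */
        printf("first read:  %u\n", (unsigned int)read_and_reset_apf_flags());
        printf("second read: %u\n", (unsigned int)read_and_reset_apf_flags());
        return 0;
}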
*/ + kvm_async_pf_task_wait_schedule(token); + } else { + kvm_async_pf_task_wake(token); + } + + instrumentation_end(); + idtentry_exit_cond_rcu(regs, rcu_exit); + return true; } -NOKPROBE_SYMBOL(do_async_page_fault); static void __init paravirt_ops_setup(void) { @@ -306,11 +313,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; -#ifdef CONFIG_PREEMPTION - pa |= KVM_ASYNC_PF_SEND_ALWAYS; -#endif + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); pa |= KVM_ASYNC_PF_ENABLED; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) @@ -318,12 +325,12 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - printk(KERN_INFO"KVM setup async PF for cpu %d\n", - smp_processor_id()); + pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __this_cpu_write(kvm_apic_eoi, 0); @@ -344,8 +351,7 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", - smp_processor_id()); + pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); } static void kvm_pv_guest_cpu_reboot(void *unused) @@ -592,12 +598,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu) } #endif -static void __init kvm_apf_trap_init(void) -{ - update_intr_gate(X86_TRAP_PF, async_page_fault); -} - - static void kvm_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@ -632,8 +632,6 @@ static void __init kvm_guest_init(void) register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) raw_spin_lock_init(&async_pf_sleepers[i].lock); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) - x86_init.irqs.trap_init = kvm_apf_trap_init; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; @@ -649,6 +647,9 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + static_branch_enable(&kvm_async_pf_enabled); + #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 84c3ba32f211..8748321c4486 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -8,7 +8,7 @@ * * Lock order: * contex.ldt_usr_sem - * mmap_sem + * mmap_lock * context.lock */ diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c deleted file mode 100644 index 6a68e41206e7..000000000000 --- a/arch/x86/kernel/livepatch.c +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * livepatch.c - x86-specific Kernel Live Patching Core - */ - -#include <linux/module.h> -#include <linux/kallsyms.h> -#include <linux/livepatch.h> -#include <asm/text-patching.h> - -/* Apply per-object alternatives. 
Based on x86 module_finalize() */ -void arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) -{ - int cnt; - struct klp_modinfo *info; - Elf_Shdr *s, *alt = NULL, *para = NULL; - void *aseg, *pseg; - const char *objname; - char sec_objname[MODULE_NAME_LEN]; - char secname[KSYM_NAME_LEN]; - - info = patch->mod->klp_info; - objname = obj->name ? obj->name : "vmlinux"; - - /* See livepatch core code for BUILD_BUG_ON() explanation */ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); - - for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) { - /* Apply per-object .klp.arch sections */ - cnt = sscanf(info->secstrings + s->sh_name, - ".klp.arch.%55[^.].%127s", - sec_objname, secname); - if (cnt != 2) - continue; - if (strcmp(sec_objname, objname)) - continue; - if (!strcmp(".altinstructions", secname)) - alt = s; - if (!strcmp(".parainstructions", secname)) - para = s; - } - - if (alt) { - aseg = (void *) alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - - if (para) { - pseg = (void *) para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } -} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 02bddfc122a4..64b00b0d7fe8 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,7 +13,6 @@ #include <linux/gfp.h> #include <linux/io.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ad5cdd6a5f23..a29a44a98e5b 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -19,7 +19,6 @@ #include <linux/efi.h> #include <asm/init.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d5c72cb877b3..34b153cbd4ac 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -18,10 +18,10 @@ #include <linux/gfp.h> #include <linux/jump_label.h> #include <linux/random.h> +#include <linux/memory.h> #include <asm/text-patching.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/unwind.h> @@ -126,11 +126,12 @@ int apply_relocate(Elf32_Shdr *sechdrs, return 0; } #else /*X86_64*/ -int apply_relocate_add(Elf64_Shdr *sechdrs, +static int __apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, unsigned int relsec, - struct module *me) + struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) { unsigned int i; Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; @@ -162,19 +163,19 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_X86_64_64: if (*(u64 *)loc != 0) goto invalid_relocation; - *(u64 *)loc = val; + write(loc, &val, 8); break; case R_X86_64_32: if (*(u32 *)loc != 0) goto invalid_relocation; - *(u32 *)loc = val; + write(loc, &val, 4); if (val != *(u32 *)loc) goto overflow; break; case R_X86_64_32S: if (*(s32 *)loc != 0) goto invalid_relocation; - *(s32 *)loc = val; + write(loc, &val, 4); if ((s64)val != *(s32 *)loc) goto overflow; break; @@ -183,7 +184,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (*(u32 *)loc != 0) goto invalid_relocation; val -= (u64)loc; - *(u32 *)loc = val; + write(loc, &val, 4); #if 0 if ((s64)val != *(s32 *)loc) goto overflow; @@ -193,7 +194,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (*(u64 *)loc != 0) goto 
invalid_relocation; val -= (u64)loc; - *(u64 *)loc = val; + write(loc, &val, 8); break; default: pr_err("%s: Unknown rela relocation: %llu\n", @@ -215,6 +216,33 @@ overflow: me->name); return -ENOEXEC; } + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + int ret; + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) { + write = text_poke; + mutex_lock(&text_mutex); + } + + ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, + write); + + if (!early) { + text_poke_sync(); + mutex_unlock(&text_mutex); + } + + return ret; +} + #endif int module_finalize(const Elf_Ehdr *hdr, diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 6407ea21fa1b..3a98ff36f411 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -25,10 +25,6 @@ #include <linux/atomic.h> #include <linux/sched/clock.h> -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - #include <asm/cpu_entry_area.h> #include <asm/traps.h> #include <asm/mach_traps.h> @@ -307,7 +303,7 @@ NOKPROBE_SYMBOL(unknown_nmi_error); static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static void default_do_nmi(struct pt_regs *regs) +static noinstr void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; @@ -333,6 +329,9 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); + instrumentation_begin(); + trace_hardirqs_off_finish(); + handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { @@ -346,7 +345,7 @@ static void default_do_nmi(struct pt_regs *regs) */ if (handled > 1) __this_cpu_write(swallow_nmi, true); - return; + goto out; } /* @@ -378,7 +377,7 @@ static void default_do_nmi(struct pt_regs *regs) #endif __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); - return; + goto out; } raw_spin_unlock(&nmi_reason_lock); @@ -416,8 +415,12 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_add(nmi_stats.swallow, 1); else unknown_nmi_error(reason, regs); + +out: + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); } -NOKPROBE_SYMBOL(default_do_nmi); /* * NMIs can page fault or hit breakpoints which will cause it to lose @@ -471,44 +474,9 @@ enum nmi_states { }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); static DEFINE_PER_CPU(unsigned long, nmi_cr2); +static DEFINE_PER_CPU(unsigned long, nmi_dr7); -#ifdef CONFIG_X86_64 -/* - * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without - * some care, the inner breakpoint will clobber the outer breakpoint's - * stack. - * - * If a breakpoint is being processed, and the debug stack is being - * used, if an NMI comes in and also hits a breakpoint, the stack - * pointer will be set to the same fixed address as the breakpoint that - * was interrupted, causing that stack to be corrupted. To handle this - * case, check if the stack that was interrupted is the debug stack, and - * if so, change the IDT so that new breakpoints will use the current - * stack and not switch to the fixed address. On return of the NMI, - * switch back to the original IDT. 
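/*
 * Illustrative sketch, not part of the patch: __apply_relocate_add() above
 * is parameterized on a write callback so unformed modules are patched with
 * plain memcpy while already-live text goes through text_poke() under
 * text_mutex. A minimal standalone model of swapping the writer; the
 * logged_write() helper is an invented stand-in for text_poke():
 */
#include <stdio.h>
#include <string.h>

typedef void *(*write_fn)(void *dest, const void *src, size_t len);

static void *logged_write(void *dest, const void *src, size_t len)
{
        printf("patching %zu bytes of live text\n", len);
        return memcpy(dest, src, len);
}

static void apply_reloc(void *loc, unsigned int value, write_fn write)
{
        write(loc, &value, sizeof(value));      /* R_X86_64_32-style store */
}

int main(void)
{
        unsigned int slot = 0;

        apply_reloc(&slot, 0xdeadbeef, memcpy);         /* early: direct copy */
        apply_reloc(&slot, 0xfeedface, logged_write);   /* live: via the poker */
        printf("slot = %#x\n", slot);
        return 0;
}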
- */ -static DEFINE_PER_CPU(int, update_debug_stack); - -static bool notrace is_debug_stack(unsigned long addr) -{ - struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); - unsigned long top = CEA_ESTACK_TOP(cs, DB); - unsigned long bot = CEA_ESTACK_BOT(cs, DB1); - - if (__this_cpu_read(debug_stack_usage)) - return true; - /* - * Note, this covers the guard page between DB and DB1 as well to - * avoid two checks. But by all means @addr can never point into - * the guard page. - */ - return addr >= bot && addr < top; -} -NOKPROBE_SYMBOL(is_debug_stack); -#endif - -dotraplinkage notrace void -do_nmi(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_NMI(exc_nmi) { if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) return; @@ -521,18 +489,7 @@ do_nmi(struct pt_regs *regs, long error_code) this_cpu_write(nmi_cr2, read_cr2()); nmi_restart: -#ifdef CONFIG_X86_64 - /* - * If we interrupted a breakpoint, it is possible that - * the nmi handler will have breakpoints too. We need to - * change the IDT such that breakpoints that happen here - * continue to use the NMI stack. - */ - if (unlikely(is_debug_stack(regs->sp))) { - debug_stack_set_zero(); - this_cpu_write(update_debug_stack, 1); - } -#endif + this_cpu_write(nmi_dr7, local_db_save()); nmi_enter(); @@ -543,12 +500,7 @@ nmi_restart: nmi_exit(); -#ifdef CONFIG_X86_64 - if (unlikely(this_cpu_read(update_debug_stack))) { - debug_stack_reset(); - this_cpu_write(update_debug_stack, 0); - } -#endif + local_db_restore(this_cpu_read(nmi_dr7)); if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); @@ -558,7 +510,6 @@ nmi_restart: if (user_mode(regs)) mds_user_clear_cpu_buffers(); } -NOKPROBE_SYMBOL(do_nmi); void stop_nmi(void) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c131ba4e70ef..674a7d66d960 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -13,13 +13,13 @@ #include <linux/bcd.h> #include <linux/highmem.h> #include <linux/kprobes.h> +#include <linux/pgtable.h> #include <asm/bug.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> -#include <asm/pgtable.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> @@ -160,25 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len, return insn_len; } -static void native_flush_tlb(void) -{ - __native_flush_tlb(); -} - -/* - * Global pages have to be flushed a bit differently. Not a real - * performance problem because this does not happen often. - */ -static void native_flush_tlb_global(void) -{ - __native_flush_tlb_global(); -} - -static void native_flush_tlb_one_user(unsigned long addr) -{ - __native_flush_tlb_one_user(addr); -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -359,7 +340,7 @@ struct paravirt_patch_template pv_ops = { #endif /* CONFIG_PARAVIRT_XXL */ /* Mmu ops. */ - .mmu.flush_tlb_user = native_flush_tlb, + .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_others = native_flush_tlb_others, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9da70b279dad..8e3d0347b664 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -96,7 +96,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) } /* - * Free current thread data structures etc.. 
+ * Free thread data structures etc.. */ void exit_thread(struct task_struct *tsk) { @@ -104,7 +104,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) - io_bitmap_exit(); + io_bitmap_exit(tsk); free_vm86(t); @@ -191,7 +191,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(&tsk->thread.fpu); + fpu__clear_all(&tsk->thread.fpu); } void disable_TSC(void) @@ -612,6 +612,17 @@ void speculation_ctrl_update_current(void) preempt_enable(); } +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + newval = cr4 ^ mask; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 954b013cc585..acfd6d2a0cbf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -39,7 +39,6 @@ #include <linux/kdebug.h> #include <linux/syscalls.h> -#include <asm/pgtable.h> #include <asm/ldt.h> #include <asm/processor.h> #include <asm/fpu/internal.h> @@ -52,7 +51,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/proto.h> #include "process.h" diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5ef9d8f25b0e..9a97415b2139 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -40,7 +40,6 @@ #include <linux/ftrace.h> #include <linux/syscalls.h> -#include <asm/pgtable.h> #include <asm/processor.h> #include <asm/fpu/internal.h> #include <asm/mmu_context.h> @@ -52,7 +51,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f0e1ddbc2fd7..44130588987f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -28,7 +28,6 @@ #include <linux/nospec.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/processor.h> #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3ca43be4f9cf..e040ba6be27b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -11,13 +11,13 @@ #include <linux/tboot.h> #include <linux/delay.h> #include <linux/frame.h> +#include <linux/pgtable.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> -#include <asm/pgtable.h> #include <asm/proto.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4b3fa6cd3106..a3767e74c758 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -237,6 +237,9 @@ static u64 __init get_ramdisk_image(void) ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + if (ramdisk_image == 0) + ramdisk_image = phys_initrd_start; + return ramdisk_image; } static u64 __init get_ramdisk_size(void) @@ -245,6 +248,9 @@ static u64 __init get_ramdisk_size(void) ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + if (ramdisk_size == 0) + ramdisk_size 
= phys_initrd_size; + return ramdisk_size; } diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e6d7894ad127..fd945ce78554 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void) /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in - * the smpboot asm. We can't reliably pick up percpu mappings - * using vmalloc_fault(), because exception dispatch needs - * percpu data. + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. * * FIXME: Can the later sync in setup_cpu_entry_areas() replace * this call? diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 83b74fb38c8f..399f97abee02 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,6 +37,7 @@ #include <asm/vm86.h> #ifdef CONFIG_X86_64 +#include <linux/compat.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> #endif /* CONFIG_X86_64 */ @@ -511,6 +512,31 @@ Efault: } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} +#endif /* CONFIG_X86_X32_ABI */ + static int x32_setup_rt_frame(struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) @@ -543,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } @@ -732,7 +758,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - fpu__clear(fpu); + fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index b8d4e9c3c070..eff4ce3b10da 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -27,6 +27,7 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/idtentry.h> #include <asm/nmi.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> @@ -130,13 +131,11 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) /* * this function calls the 'stop' function on all other CPUs in the system. */ - -asmlinkage __visible void smp_reboot_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { - ipi_entering_ack_irq(); + ack_APIC_irq(); cpu_emergency_vmxoff(); stop_this_cpu(NULL); - irq_exit(); } static int register_stop_handler(void) @@ -221,47 +220,33 @@ static void native_stop_other_cpus(int wait) /* * Reschedule call back. KVM uses this interrupt to force a cpu out of - * guest mode + * guest mode. 
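[Editor's sketch] The cr4_toggle_bits_irqsoff() helper added in process.c above XORs the mask into a cached per-CPU copy of CR4 and only issues the expensive serializing __write_cr4() when the value really changes. A user-space sketch of the cached-shadow pattern; hw_write() is a hypothetical stand-in for the register write:

    #include <stdio.h>

    static unsigned long cr4_shadow;        /* cached copy of the register */

    static void hw_write(unsigned long val) /* stand-in for __write_cr4() */
    {
        printf("CR4 <- %#lx (serializing write)\n", val);
    }

    static void cr4_toggle_bits(unsigned long mask)
    {
        unsigned long newval = cr4_shadow ^ mask;

        if (newval != cr4_shadow) {  /* skip the write when nothing flips */
            cr4_shadow = newval;
            hw_write(newval);
        }
    }

    int main(void)
    {
        cr4_toggle_bits(0x8);  /* flips a bit -> one hardware write */
        cr4_toggle_bits(0x0);  /* no change   -> no hardware write  */
        return 0;
    }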
*/ -__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { ack_APIC_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); - kvm_set_cpu_l1tf_flush_l1d(); - - if (trace_resched_ipi_enabled()) { - /* - * scheduler_ipi() might call irq_enter() as well, but - * nested calls are fine. - */ - irq_enter(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - scheduler_ipi(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - irq_exit(); - return; - } scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); } -__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); } -__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); - exiting_irq(); } static int __init nonmi_ipi_setup(char *str) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe3ab9632f3b..ffbd9a3d78d8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -55,6 +55,7 @@ #include <linux/gfp.h> #include <linux/cpuidle.h> #include <linux/numa.h> +#include <linux/pgtable.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -63,7 +64,6 @@ #include <asm/realmode.h> #include <asm/cpu.h> #include <asm/numa.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> #include <asm/mwait.h> @@ -147,7 +147,7 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } -static void init_freq_invariance(void); +static void init_freq_invariance(bool secondary); /* * Report back to the Boot Processor during boot time or to the caller processor @@ -185,7 +185,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(); + init_freq_invariance(true); /* * Get our bogomips. @@ -266,6 +266,14 @@ static void notrace start_secondary(void *unused) wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + + /* + * Prevent tail call to cpu_startup_entry() because the stack protector + * guard has been changed a couple of function calls up, in + * boot_init_stack_canary() and must not be checked before tail calling + * another function. 
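[Editor's sketch] The comment above explains the hazard: boot_init_stack_canary() changed the stack-protector guard a few calls up, and cpu_startup_entry() never returns, so start_secondary()'s epilogue canary check must never execute. A tail call would run that epilogue before jumping. One standard way to defeat tail-call optimization, shown here as a toy (this is an illustration of the technique, not the kernel's prevent_tail_call_optimization() definition), is any statement after the call:

    #include <stdio.h>

    static void worker(void)
    {
        puts("worker runs; caller's frame must survive the call");
    }

    static void start(void)
    {
        worker();
        /* Empty asm after the call: the call can no longer be turned
         * into a jump, so this function's epilogue (and any
         * stack-protector check) runs only after worker() returns. */
        asm volatile("" ::: "memory");
    }

    int main(void)
    {
        start();
        return 0;
    }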
+ */ + prevent_tail_call_optimization(); } /** @@ -1341,7 +1349,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); - init_freq_invariance(); + init_freq_invariance(false); smp_sanity_check(); switch (apic_intr_mode) { @@ -1376,12 +1384,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) speculative_store_bypass_ht_init(); } -void arch_enable_nonboot_cpus_begin(void) +void arch_thaw_secondary_cpus_begin(void) { set_mtrr_aps_delayed_init(); } -void arch_enable_nonboot_cpus_end(void) +void arch_thaw_secondary_cpus_end(void) { mtrr_aps_init(); } @@ -1849,24 +1857,25 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) #include <asm/cpu_device_id.h> #include <asm/intel-family.h> -#define ICPU(model) \ - {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_XEON_PHI_KNL), - ICPU(INTEL_FAM6_XEON_PHI_KNM), + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_SKYLAKE_X), + X86_MATCH(SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_ATOM_GOLDMONT), - ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), - ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), {} }; @@ -1877,9 +1886,6 @@ static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int err, i; u64 msr; - if (!x86_match_cpu(has_knl_turbo_ratio_limits)) - return false; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; @@ -1945,18 +1951,23 @@ static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { + u64 msr; int err; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq); + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - *turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */ + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ return true; } @@ -1972,7 +1983,8 @@ static bool intel_set_max_freq_ratio(void) skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; - if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_skx_turbo_ratio_limits) && @@ -1985,13 +1997,22 @@ static bool intel_set_max_freq_ratio(void) return false; out: + /* + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. 
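[Editor's sketch] core_set_max_freq_ratio() above reads the max non-turbo ratio from bits 15:8 of MSR_PLATFORM_INFO and the 4-core turbo ratio from bits 31:24 of MSR_TURBO_RATIO_LIMIT, falling back to the 1-core ratio in bits 7:0 when the 4C field is zero; the freq-invariance ratio is then turbo * SCHED_CAPACITY_SCALE / base. Standalone arithmetic illustration with made-up MSR values:

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_CAPACITY_SCALE  1024ULL   /* matches the kernel constant */

    int main(void)
    {
        /* Hypothetical raw MSR contents, for illustration only. */
        uint64_t platform_info     = 0x2400ULL;      /* bits 15:8 = 0x24 */
        uint64_t turbo_ratio_limit = 0x2A2B2C2DULL;  /* bits 31:24 = 0x2A */

        uint64_t base  = (platform_info >> 8) & 0xFF;       /* max P state */
        uint64_t turbo = (turbo_ratio_limit >> 24) & 0xFF;  /* 4C turbo    */

        if (!turbo)                       /* fewer than 4 cores: 1C turbo */
            turbo = turbo_ratio_limit & 0xFF;

        /* base 36, turbo 42 -> ratio 42*1024/36 = 1194 */
        printf("base %llu turbo %llu ratio %llu\n",
               (unsigned long long)base, (unsigned long long)turbo,
               (unsigned long long)(turbo * SCHED_CAPACITY_SCALE / base));
        return 0;
    }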
+ */ + if (!base_freq) { + pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n"); + return false; + } + arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); arch_set_max_freq_ratio(turbo_disabled()); return true; } -static void init_counter_refs(void *arg) +static void init_counter_refs(void) { u64 aperf, mperf; @@ -2002,18 +2023,25 @@ static void init_counter_refs(void *arg) this_cpu_write(arch_prev_mperf, mperf); } -static void init_freq_invariance(void) +static void init_freq_invariance(bool secondary) { bool ret = false; - if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; + if (secondary) { + if (static_branch_likely(&arch_scale_freq_key)) { + init_counter_refs(); + } + return; + } + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ret = intel_set_max_freq_ratio(); if (ret) { - on_each_cpu(init_counter_refs, NULL, 1); + init_counter_refs(); static_branch_enable(&arch_scale_freq_key); } else { pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index ab03fede1422..f8d65c99feb8 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -135,26 +135,30 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) typeof(ubuf->st_gid) gid = 0; SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); - if (!access_ok(ubuf, sizeof(struct stat64)) || - __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || - __put_user(stat->ino, &ubuf->__st_ino) || - __put_user(stat->ino, &ubuf->st_ino) || - __put_user(stat->mode, &ubuf->st_mode) || - __put_user(stat->nlink, &ubuf->st_nlink) || - __put_user(uid, &ubuf->st_uid) || - __put_user(gid, &ubuf->st_gid) || - __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) || - __put_user(stat->size, &ubuf->st_size) || - __put_user(stat->atime.tv_sec, &ubuf->st_atime) || - __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) || - __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) || - __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user(stat->blksize, &ubuf->st_blksize) || - __put_user(stat->blocks, &ubuf->st_blocks)) + if (!user_write_access_begin(ubuf, sizeof(struct stat64))) return -EFAULT; + unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault); + unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault); + unsafe_put_user(stat->ino, &ubuf->st_ino, Efault); + unsafe_put_user(stat->mode, &ubuf->st_mode, Efault); + unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault); + unsafe_put_user(uid, &ubuf->st_uid, Efault); + unsafe_put_user(gid, &ubuf->st_gid, Efault); + unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault); + unsafe_put_user(stat->size, &ubuf->st_size, Efault); + unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault); + unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault); + unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault); + unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault); + unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault); + unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault); + unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault); + unsafe_put_user(stat->blocks, 
&ubuf->st_blocks, Efault); + user_access_end(); return 0; +Efault: + user_write_access_end(); + return -EFAULT; } COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b89f6ac6a0c0..992fb1415c0f 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -23,7 +23,6 @@ #include <asm/realmode.h> #include <asm/processor.h> #include <asm/bootparam.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/swiotlb.h> #include <asm/fixmap.h> @@ -35,8 +34,7 @@ #include "../realmode/rm/wakeup.h" /* Global pointer to shared data; NULL means no measured launch. */ -struct tboot *tboot __read_mostly; -EXPORT_SYMBOL(tboot); +static struct tboot *tboot __read_mostly; /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 @@ -46,6 +44,11 @@ EXPORT_SYMBOL(tboot); static u8 tboot_uuid[16] __initdata = TBOOT_UUID; +bool tboot_enabled(void) +{ + return tboot != NULL; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ @@ -90,7 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), }; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 106e7f87f534..371a6b348e44 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -103,6 +103,9 @@ static __init void x86_late_time_init(void) */ x86_init.irqs.intr_mode_init(); tsc_init(); + + if (static_cpu_has(X86_FEATURE_WAITPKG)) + use_tpause_delay(); } /* diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 496748ed266a..fcfc077afe2d 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -25,20 +25,3 @@ void trace_pagefault_unreg(void) { static_branch_dec(&trace_pagefault_key); } - -#ifdef CONFIG_SMP - -DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); - -int trace_resched_ipi_reg(void) -{ - static_branch_inc(&trace_resched_ipi_key); - return 0; -} - -void trace_resched_ipi_unreg(void) -{ - static_branch_dec(&trace_resched_ipi_key); -} - -#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..7febae381b91 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,12 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> +#include <linux/hardirq.h> +#include <linux/atomic.h> + #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> -#include <linux/atomic.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> @@ -82,78 +84,6 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } -/* - * In IST context, we explicitly disable preemption. This serves two - * purposes: it makes it much less likely that we would accidentally - * schedule in IST context and it will force a warning if we somehow - * manage to schedule by accident. - */ -void ist_enter(struct pt_regs *regs) -{ - if (user_mode(regs)) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - } else { - /* - * We might have interrupted pretty much anything. In - * fact, if we're a machine check, we can even interrupt - * NMI processing. We don't want in_nmi() to return true, - * but we need to notify RCU. 
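[Editor's sketch] The cp_stat64() conversion above replaces a chain of __put_user() calls, each with its own fault setup, by a single user_write_access_begin()/user_access_end() section in which unsafe_put_user() does a bare store and branches to the Efault label on a fault. A schematic of the pattern, mirroring the begin/end pairing used in the hunk (kernel-only API, not runnable in user space; struct foo and copy_foo_to_user() are hypothetical):

    #include <linux/uaccess.h>    /* user_write_access_begin() etc. */

    struct foo { int a; long b; };    /* hypothetical layout */

    static int copy_foo_to_user(struct foo __user *ubuf, const struct foo *k)
    {
        if (!user_write_access_begin(ubuf, sizeof(*ubuf)))
            return -EFAULT;       /* range not writable at all */

        /* Bare stores; a fault on any of them jumps to Efault. */
        unsafe_put_user(k->a, &ubuf->a, Efault);
        unsafe_put_user(k->b, &ubuf->b, Efault);

        user_access_end();
        return 0;

    Efault:
        user_write_access_end();
        return -EFAULT;
    }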
- */ - rcu_nmi_enter(); - } - - preempt_disable(); - - /* This code is a bit fragile. Test it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); -} -NOKPROBE_SYMBOL(ist_enter); - -void ist_exit(struct pt_regs *regs) -{ - preempt_enable_no_resched(); - - if (!user_mode(regs)) - rcu_nmi_exit(); -} - -/** - * ist_begin_non_atomic() - begin a non-atomic section in an IST exception - * @regs: regs passed to the IST exception handler - * - * IST exception handlers normally cannot schedule. As a special - * exception, if the exception interrupted userspace code (i.e. - * user_mode(regs) would return true) and the exception was not - * a double fault, it can be safe to schedule. ist_begin_non_atomic() - * begins a non-atomic section within an ist_enter()/ist_exit() region. - * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call ist_end_non_atomic() - * before ist_exit(). - */ -void ist_begin_non_atomic(struct pt_regs *regs) -{ - BUG_ON(!user_mode(regs)); - - /* - * Sanity check: we need to be on the normal thread stack. This - * will catch asm bugs and any attempt to use ist_preempt_enable - * from double_fault. - */ - BUG_ON(!on_thread_stack()); - - preempt_enable_no_resched(); -} - -/** - * ist_end_non_atomic() - begin a non-atomic section in an IST exception - * - * Ends a non-atomic section started with ist_begin_non_atomic(). - */ -void ist_end_non_atomic(void) -{ - preempt_disable(); -} - int is_valid_bugaddr(unsigned long addr) { unsigned short ud; @@ -215,7 +145,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. + * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; @@ -271,30 +201,78 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); + cond_local_irq_disable(regs); } } -#define IP ((void __user *)uprobe_get_trap_addr(regs)) -#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ +/* + * Posix requires to provide the address of the faulting instruction for + * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. + * + * This address is usually regs->ip, but when an uprobe moved the code out + * of line then regs->ip points to the XOL code which would confuse + * anything which analyzes the fault address vs. the unmodified binary. If + * a trap happened in XOL code then uprobe maps regs->ip back to the + * original instruction address. 
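[Editor's sketch] do_error_trap() above now brackets its work with cond_local_irq_enable()/cond_local_irq_disable(), so the handler returns with the interrupt flag in the state the interrupted context had (X86_EFLAGS_IF in regs->flags). The helpers are essentially this mirrored pair (schematic):

    /* Mirror the interrupted context's IF state around the handler. */
    static void cond_local_irq_enable(struct pt_regs *regs)
    {
        if (regs->flags & X86_EFLAGS_IF)
            local_irq_enable();
    }

    static void cond_local_irq_disable(struct pt_regs *regs)
    {
        if (regs->flags & X86_EFLAGS_IF)
            local_irq_disable();
    }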
+ */ +static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) +{ + return (void __user *)uprobe_get_trap_addr(regs); } -DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error) -DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow) -DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) -DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) -DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -#undef IP +DEFINE_IDTENTRY(exc_divide_error) +{ + do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE, + FPE_INTDIV, error_get_trap_addr(regs)); +} -dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_overflow) { - char *str = "alignment check"; + do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); +} - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); +#ifdef CONFIG_X86_F00F_BUG +void handle_invalid_op(struct pt_regs *regs) +#else +static inline void handle_invalid_op(struct pt_regs *regs) +#endif +{ + do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, + ILL_ILLOPN, error_get_trap_addr(regs)); +} + +DEFINE_IDTENTRY(exc_invalid_op) +{ + handle_invalid_op(regs); +} + +DEFINE_IDTENTRY(exc_coproc_segment_overrun) +{ + do_error_trap(regs, 0, "coprocessor segment overrun", + X86_TRAP_OLD_MF, SIGFPE, 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) +{ + do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, + 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) +{ + do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, + SIGBUS, 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) +{ + do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, + 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) +{ + char *str = "alignment check"; if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; @@ -326,7 +304,6 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * @@ -342,12 +319,19 @@ __visible void __noreturn handle_stack_overflow(const char *message, * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. + * + * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs + * to be read before doing anything else. */ -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) +DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_VMAP_STACK + unsigned long address = read_cr2(); +#endif + #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -363,13 +347,14 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * - * No need for ist_enter here because we don't use RCU. 
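[Editor's sketch] Each DEFINE_IDTENTRY(exc_*) above replaces a DO_ERROR()-stamped do_*() handler: the macro emits the real entry point, which performs the entry/exit state transitions that used to be open-coded (the removed RCU_LOCKDEP_WARN/ist_enter logic), and leaves the body as ordinary C. A simplified expansion sketch, assuming the shape in arch/x86/include/asm/idtentry.h (helper names and details vary across kernel versions):

    /* Simplified; see arch/x86/include/asm/idtentry.h for the real macro. */
    #define DEFINE_IDTENTRY(func)                                       \
    static __always_inline void __##func(struct pt_regs *regs);        \
                                                                        \
    __visible noinstr void func(struct pt_regs *regs)                  \
    {                                                                   \
        idtentry_enter(regs);       /* RCU/context-tracking entry */    \
        instrumentation_begin();                                        \
        __##func(regs);             /* the body written below */        \
        instrumentation_end();                                          \
        idtentry_exit(regs);                                            \
    }                                                                   \
                                                                        \
    static __always_inline void __##func(struct pt_regs *regs)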
+ * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the @@ -377,7 +362,11 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * in gpregs->ss through gpregs->ip. * */ - memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->ip = p[0]; + gpregs->cs = p[1]; + gpregs->flags = p[2]; + gpregs->sp = p[3]; + gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* @@ -391,14 +380,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ - regs->ip = (unsigned long)general_protection; + regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; } #endif - ist_enter(regs); + nmi_enter(); + instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -442,28 +432,31 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * stack even if the actual trigger for the double fault was * something else. */ - if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) - handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); + if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) { + handle_stack_overflow("kernel stack overflow (double-fault)", + regs, address); + } #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); panic("Machine halted."); + instrumentation_end(); } -#endif -dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_bounds) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - if (notify_die(DIE_TRAP, "bounds", regs, error_code, + if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) - die("bounds", regs, error_code); + die("bounds", regs, 0); + + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); - do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); + cond_local_irq_disable(regs); } enum kernel_gp_hint { @@ -510,7 +503,7 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, #define GPFSTR "general protection fault" -dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; @@ -518,17 +511,17 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) unsigned long gp_addr; int ret; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) - return; + goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + local_irq_disable(); return; } @@ -540,12 +533,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) show_signal(tsk, SIGSEGV, "", desc, 
regs, error_code); force_sig(SIGSEGV); - - return; + goto exit; } if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - return; + goto exit; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; @@ -557,11 +549,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, X86_TRAP_GP)) - return; + goto exit; ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); if (ret == NOTIFY_STOP) - return; + goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); @@ -583,55 +575,74 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) die_addr(desc, regs, error_code, gp_addr); +exit: + cond_local_irq_disable(regs); } -NOKPROBE_SYMBOL(do_general_protection); -dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) +static bool do_int3(struct pt_regs *regs) { - if (poke_int3_handler(regs)) - return; - - /* - * Unlike any other non-IST entry, we can be called from a kprobe in - * non-CONTEXT_KERNEL kernel mode or even during context tracking - * state changes. Make sure that we wake up RCU even if we're coming - * from kernel code. - * - * This means that we can't schedule even if we came from a - * preemptible kernel context. That's okay. - */ - if (!user_mode(regs)) { - rcu_nmi_enter(); - preempt_disable(); - } - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + int res; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) + return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) - goto exit; + return true; #endif + res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); - if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; + return res == NOTIFY_STOP; +} + +static void do_int3_user(struct pt_regs *regs) +{ + if (do_int3(regs)) + return; cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); +} -exit: - if (!user_mode(regs)) { - preempt_enable_no_resched(); - rcu_nmi_exit(); +DEFINE_IDTENTRY_RAW(exc_int3) +{ + /* + * poke_int3_handler() is completely self contained code; it does (and + * must) *NOT* call out to anything, lest it hits upon yet another + * INT3. + */ + if (poke_int3_handler(regs)) + return; + + /* + * idtentry_enter_user() uses static_branch_{,un}likely() and therefore + * can trigger INT3, hence poke_int3_handler() must be done + * before. If the entry came from kernel mode, then use nmi_enter() + * because the INT3 could have been hit in any context including + * NMI. + */ + if (user_mode(regs)) { + idtentry_enter_user(regs); + instrumentation_begin(); + do_int3_user(regs); + instrumentation_end(); + idtentry_exit_user(regs); + } else { + nmi_enter(); + instrumentation_begin(); + trace_hardirqs_off_finish(); + if (!do_int3(regs)) + die("int3", regs, 0); + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); + nmi_exit(); } } -NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* @@ -639,21 +650,20 @@ NOKPROBE_SYMBOL(do_int3); * to switch to the normal thread stack if the interrupted code was in * user mode. 
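[Editor's sketch] The #DF ESPFIX fixup above and fixup_bad_iret() below shuffle the same five-word frame the CPU pushes for a 64-bit exception — ip, cs, flags, sp, ss — which is why both copies move exactly 5*8 bytes. An illustrative struct view of that frame (layout mirrors the tail of pt_regs; values are made up):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* The five words the CPU pushes; the #DF path copies them as
     * p[0]..p[4], fixup_bad_iret() as a 5*8-byte memcpy. */
    struct iret_frame {
        uint64_t ip;
        uint64_t cs;
        uint64_t flags;
        uint64_t sp;
        uint64_t ss;
    };

    int main(void)
    {
        uint64_t words[5] = { 0x1000, 0x10, 0x202, 0x7fff0000, 0x18 };
        struct iret_frame f;

        memcpy(&f, words, sizeof(f));  /* gpregs->ip = p[0]; ... ss = p[4]; */
        printf("ip=%#llx ss=%#llx\n",
               (unsigned long long)f.ip, (unsigned long long)f.ss);
        return 0;
    }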
The actual stack switch is done in entry_64.S */ -asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; } -NOKPROBE_SYMBOL(sync_regs); struct bad_iret_stack { void *error_entry_ret; struct pt_regs regs; }; -asmlinkage __visible notrace +asmlinkage __visible noinstr struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) { /* @@ -664,19 +674,21 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ - struct bad_iret_stack *new_stack = - (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + struct bad_iret_stack tmp, *new_stack = + (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - /* Copy the IRET target to the new stack. */ - memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + /* Copy the IRET target to the temporary storage. */ + memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + + /* Update the entry stack */ + memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(&new_stack->regs)); return new_stack; } -NOKPROBE_SYMBOL(fixup_bad_iret); #endif static bool is_sysenter_singlestep(struct pt_regs *regs) @@ -702,6 +714,43 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } +static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) +{ + /* + * Disable breakpoints during exception handling; recursive exceptions + * are exceedingly 'fun'. + * + * Since this function is NOKPROBE, and that also applies to + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a + * HW_BREAKPOINT_W on our stack) + * + * Entry text is excluded for HW_BP_X and cpu_entry_area, which + * includes the entry stack is excluded for everything. + */ + *dr7 = local_db_save(); + + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + get_debugreg(*dr6, 6); + set_debugreg(0, 6); + /* Filter out all the reserved bits which are preset to 1 */ + *dr6 &= ~DR6_RESERVED; +} + +static __always_inline void debug_exit(unsigned long dr7) +{ + local_db_restore(dr7); +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -726,86 +775,54 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) * * May run on IST stack. */ -dotraplinkage void do_debug(struct pt_regs *regs, long error_code) +static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user) { struct task_struct *tsk = current; - int user_icebp = 0; - unsigned long dr6; + bool user_icebp; int si_code; - ist_enter(regs); - - get_debugreg(dr6, 6); - /* - * The Intel SDM says: - * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. 
To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. - * - * Keep it simple: clear DR6 immediately. - */ - set_debugreg(0, 6); - - /* Filter out all the reserved bits which are preset to 1 */ - dr6 &= ~DR6_RESERVED; - /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + clear_thread_flag(TIF_BLOCKSTEP); - if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && - is_sysenter_singlestep(regs))) { - dr6 &= ~DR_STEP; - if (!dr6) - goto exit; - /* - * else we might have gotten a single-step trap and hit a - * watchpoint at the same time, in which case we should fall - * through and handle the watchpoint. - */ - } + /* + * If DR6 is zero, no point in trying to handle it. The kernel is + * not using INT1. + */ + if (!user && !dr6) + return; /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ - if (!dr6 && user_mode(regs)) - user_icebp = 1; + user_icebp = user && !dr6; /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; #ifdef CONFIG_KPROBES - if (kprobe_debug_handler(regs)) - goto exit; + if (kprobe_debug_handler(regs)) { + return; + } #endif - if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, - SIGTRAP) == NOTIFY_STOP) - goto exit; - - /* - * Let others (NMI) know that the debug stack is in use - * as we may switch to the interrupt stack. - */ - debug_stack_usage_inc(); + if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, + SIGTRAP) == NOTIFY_STOP) { + return; + } /* It's safe to allow irq's after DR6 has been saved */ cond_local_irq_enable(regs); if (v8086_mode(regs)) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, - X86_TRAP_DB); - cond_local_irq_disable(regs); - debug_stack_usage_dec(); - goto exit; + handle_vm86_trap((struct kernel_vm86_regs *) regs, 0, + X86_TRAP_DB); + goto out; } if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { @@ -819,23 +836,91 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; } + si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(regs, error_code, si_code); + send_sigtrap(regs, 0, si_code); + +out: cond_local_irq_disable(regs); - debug_stack_usage_dec(); +} -exit: - ist_exit(regs); +static __always_inline void exc_debug_kernel(struct pt_regs *regs, + unsigned long dr6) +{ + nmi_enter(); + instrumentation_begin(); + trace_hardirqs_off_finish(); + + /* + * Catch SYSENTER with TF set and clear DR_STEP. If this hit a + * watchpoint at the same time then that will still be handled. 
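[Editor's sketch] Per the SDM note quoted above, the CPU never clears the stale DR6 bits itself, so debug_enter() clears the register right after reading it and masks off the reserved bits that read as 1; handle_debug() can then treat a resulting DR6 of zero from user mode as an ICEBP/INT1 trap. A worked example of the masking, assuming the kernel's DR6_RESERVED value of 0xFFFF0FF0:

    #include <stdio.h>
    #include <stdint.h>

    #define DR6_RESERVED  0xFFFF0FF0UL   /* reserved bits read as 1 */

    int main(void)
    {
        /* Raw DR6 after a hit on breakpoint slot 1: hardware has the
         * reserved bits set already. */
        uint64_t dr6 = DR6_RESERVED | (1 << 1);

        dr6 &= ~DR6_RESERVED;   /* as in debug_enter() */

        /* Only real status bits remain; 0 here would mean ICEBP. */
        printf("sanitized DR6 = %#llx\n", (unsigned long long)dr6);
        return 0;
    }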
+ */ + if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + dr6 &= ~DR_STEP; + + handle_debug(regs, dr6, false); + + if (regs->flags & X86_EFLAGS_IF) + trace_hardirqs_on_prepare(); + instrumentation_end(); + nmi_exit(); +} + +static __always_inline void exc_debug_user(struct pt_regs *regs, + unsigned long dr6) +{ + idtentry_enter_user(regs); + instrumentation_begin(); + + handle_debug(regs, dr6, true); + instrumentation_end(); + idtentry_exit_user(regs); +} + +#ifdef CONFIG_X86_64 +/* IST stack entry */ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); + exc_debug_kernel(regs, dr6); + debug_exit(dr7); } -NOKPROBE_SYMBOL(do_debug); + +/* User entry, runs on regular task stack */ +DEFINE_IDTENTRY_DEBUG_USER(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); + exc_debug_user(regs, dr6); + debug_exit(dr7); +} +#else +/* 32 bit does not have separate entry points. */ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); + + debug_exit(dr7); +} +#endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -static void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = &task->thread.fpu; @@ -846,16 +931,16 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr, error_code, 0)) - return; + if (fixup_exception(regs, trapnr, 0, 0)) + goto exit; - task->thread.error_code = error_code; + task->thread.error_code = 0; task->thread.trap_nr = trapnr; - if (notify_die(DIE_TRAP, str, regs, error_code, - trapnr, SIGFPE) != NOTIFY_STOP) - die(str, regs, error_code); - return; + if (notify_die(DIE_TRAP, str, regs, 0, trapnr, + SIGFPE) != NOTIFY_STOP) + die(str, regs, 0); + goto exit; } /* @@ -864,32 +949,37 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) fpu__save(fpu); task->thread.trap_nr = trapnr; - task->thread.error_code = error_code; + task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) - return; + goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); +exit: + cond_local_irq_disable(regs); } -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_MF); + math_error(regs, X86_TRAP_MF); } -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_simd_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_XF); + if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { + /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ + if (!static_cpu_has(X86_FEATURE_XMM)) { + __exc_general_protection(regs, 0); + return; + } + } + math_error(regs, X86_TRAP_XF); } -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: @@ -912,13 +1002,10 @@ 
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) */ } -dotraplinkage void -do_device_not_available(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; @@ -927,6 +1014,8 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); + + cond_local_irq_disable(regs); return; } #endif @@ -941,22 +1030,20 @@ do_device_not_available(struct pt_regs *regs, long error_code) * to kill the task than getting stuck in a never-ending * loop of #NM faults. */ - die("unexpected #NM exception", regs, error_code); + die("unexpected #NM exception", regs, 0); } } -NOKPROBE_SYMBOL(do_device_not_available); #ifdef CONFIG_X86_32 -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_SW(iret_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); - - if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { - do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } + local_irq_disable(); } #endif @@ -968,22 +1055,9 @@ void __init trap_init(void) idt_setup_traps(); /* - * Set the IDT descriptor to a fixed read-only location, so that the - * "sidt" instruction will not leak the location of the kernel, and - * to defend the IDT against arbitrary memory write vulnerabilities. - * It will be reloaded in cpu_init() */ - cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), - PAGE_KERNEL_RO); - idt_descr.address = CPU_ENTRY_AREA_RO_IDT; - - /* * Should be a barrier for any external CPU state: */ cpu_init(); idt_setup_ist_traps(); - - x86_init.irqs.trap_init(); - - idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fdd4c1078632..49d925043171 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -41,6 +41,7 @@ EXPORT_SYMBOL(tsc_khz); * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; +static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE(__use_tsc); @@ -59,6 +60,12 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); +static int __init tsc_early_khz_setup(char *buf) +{ + return kstrtouint(buf, 0, &tsc_early_khz); +} +early_param("tsc_early_khz", tsc_early_khz_setup); + __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -1412,7 +1419,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early) if (early) { cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); + if (tsc_early_khz) + tsc_khz = tsc_early_khz; + else + tsc_khz = x86_platform.calibrate_tsc(); } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 4d732a444711..8d5cbe1bbb3b 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -81,7 +81,7 @@ #define UMIP_INST_SLDT 3 /* 0F 00 /0 */ #define UMIP_INST_STR 4 /* 0F 00 /1 */ -const char * const umip_insns[5] = { +static const char * const umip_insns[5] 
= { [UMIP_INST_SGDT] = "SGDT", [UMIP_INST_SIDT] = "SIDT", [UMIP_INST_SMSW] = "SMSW", diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a224b5ab103f..722a85f3b2dd 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -74,13 +74,7 @@ static bool in_entry_code(unsigned long ip) { char *addr = (char *)ip; - if (addr >= __entry_text_start && addr < __entry_text_end) - return true; - - if (addr >= __irqentry_text_start && addr < __irqentry_text_end) - return true; - - return false; + return addr >= __entry_text_start && addr < __entry_text_end; } static inline unsigned long *last_frame(struct unwind_state *state) @@ -344,6 +338,9 @@ bad_address: if (IS_ENABLED(CONFIG_X86_32)) goto the_end; + if (state->task != current) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index e9cc182aa97e..7f969b2d240f 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -8,19 +8,21 @@ #include <asm/orc_lookup.h> #define orc_warn(fmt, ...) \ - printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) + +#define orc_warn_current(args...) \ +({ \ + if (state->task == current) \ + orc_warn(args); \ +}) extern int __start_orc_unwind_ip[]; extern int __stop_orc_unwind_ip[]; extern struct orc_entry __start_orc_unwind[]; extern struct orc_entry __stop_orc_unwind[]; -static DEFINE_MUTEX(sort_mutex); -int *cur_orc_ip_table = __start_orc_unwind_ip; -struct orc_entry *cur_orc_table = __start_orc_unwind; - -unsigned int lookup_num_blocks; -bool orc_init; +static bool orc_init __ro_after_init; +static unsigned int lookup_num_blocks __ro_after_init; static inline unsigned long orc_ip(const int *ip) { @@ -142,9 +144,6 @@ static struct orc_entry *orc_find(unsigned long ip) { static struct orc_entry *orc; - if (!orc_init) - return NULL; - if (ip == 0) return &null_orc_entry; @@ -189,6 +188,10 @@ static struct orc_entry *orc_find(unsigned long ip) #ifdef CONFIG_MODULES +static DEFINE_MUTEX(sort_mutex); +static int *cur_orc_ip_table = __start_orc_unwind_ip; +static struct orc_entry *cur_orc_table = __start_orc_unwind; + static void orc_sort_swap(void *_a, void *_b, int size) { struct orc_entry *orc_a, *orc_b; @@ -317,12 +320,19 @@ EXPORT_SYMBOL_GPL(unwind_get_return_address); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) { + struct task_struct *task = state->task; + if (unwind_done(state)) return NULL; if (state->regs) return &state->regs->ip; + if (task != current && state->sp == task->thread.sp) { + struct inactive_task_frame *frame = (void *)task->thread.sp; + return &frame->ret_addr; + } + if (state->sp) return (unsigned long *)state->sp - 1; @@ -381,9 +391,38 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr return true; } +/* + * If state->regs is non-NULL, and points to a full pt_regs, just get the reg + * value from state->regs. + * + * Otherwise, if state->regs just points to IRET regs, and the previous frame + * had full regs, it's safe to get the value from the previous regs. This can + * happen when early/late IRQ entry code gets interrupted by an NMI. 
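[Editor's sketch] The get_reg() helper that follows exploits the fact that pt_regs is effectively an array of word-sized slots: a register is named by its byte offset into pt_regs and fetched by indexing the struct as unsigned long[reg_off/8], from either the current frame's registers or the previous (interrupted) frame's. A standalone illustration of the offsetof trick, with a toy stand-in for pt_regs:

    #include <stdio.h>
    #include <stddef.h>

    /* Toy stand-in for pt_regs: all fields are word-sized. */
    struct regs {
        unsigned long r10;
        unsigned long r13;
        unsigned long di;
        unsigned long dx;
    };

    static int get_reg(const struct regs *regs, size_t reg_off,
                       unsigned long *val)
    {
        if (!regs)
            return 0;
        /* Index the struct as an array of words, as the unwinder does. */
        *val = ((const unsigned long *)regs)[reg_off / sizeof(unsigned long)];
        return 1;
    }

    int main(void)
    {
        struct regs r = { .r10 = 10, .r13 = 13, .di = 7, .dx = 2 };
        unsigned long v;

        if (get_reg(&r, offsetof(struct regs, r13), &v))
            printf("r13 = %lu\n", v);
        return 0;
    }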
+ */ +static bool get_reg(struct unwind_state *state, unsigned int reg_off, + unsigned long *val) +{ + unsigned int reg = reg_off/8; + + if (!state->regs) + return false; + + if (state->full_regs) { + *val = ((unsigned long *)state->regs)[reg]; + return true; + } + + if (state->prev_regs) { + *val = ((unsigned long *)state->prev_regs)[reg]; + return true; + } + + return false; +} + bool unwind_next_frame(struct unwind_state *state) { - unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp; + unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; bool indirect = false; @@ -445,43 +484,39 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_REG_R10: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R10 at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) { + orc_warn_current("missing R10 value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->r10; break; case ORC_REG_R13: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R13 at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) { + orc_warn_current("missing R13 value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->r13; break; case ORC_REG_DI: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DI at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) { + orc_warn_current("missing RDI value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->di; break; case ORC_REG_DX: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DX at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) { + orc_warn_current("missing DX value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->dx; break; default: - orc_warn("unknown SP base reg %d for ip %pB\n", + orc_warn("unknown SP base reg %d at %pB\n", orc->sp_reg, (void *)state->ip); goto err; } @@ -504,44 +539,48 @@ bool unwind_next_frame(struct unwind_state *state) state->sp = sp; state->regs = NULL; + state->prev_regs = NULL; state->signal = false; break; case ORC_TYPE_REGS: if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { - orc_warn("can't dereference registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); + orc_warn_current("can't access registers at %pB\n", + (void *)orig_ip); goto err; } state->regs = (struct pt_regs *)sp; + state->prev_regs = NULL; state->full_regs = true; state->signal = true; break; case ORC_TYPE_REGS_IRET: if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { - orc_warn("can't dereference iret registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); + orc_warn_current("can't access iret registers at %pB\n", + (void *)orig_ip); goto err; } + if (state->full_regs) + state->prev_regs = state->regs; state->regs = (void *)sp - IRET_FRAME_OFFSET; state->full_regs = false; state->signal = true; break; default: - orc_warn("unknown .orc_unwind entry type %d for ip %pB\n", + orc_warn("unknown .orc_unwind entry type %d at %pB\n", orc->type, (void *)orig_ip); - break; + goto err; } /* Find BP: */ switch (orc->bp_reg) { case ORC_REG_UNDEFINED: - if (state->regs && state->full_regs) - state->bp = state->regs->bp; + if (get_reg(state, offsetof(struct pt_regs, bp), &tmp)) + state->bp = tmp; break; case ORC_REG_PREV_SP: 
@@ -564,8 +603,8 @@ bool unwind_next_frame(struct unwind_state *state) if (state->stack_info.type == prev_type && on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && state->sp <= prev_sp) { - orc_warn("stack going in the wrong direction? ip=%pB\n", - (void *)orig_ip); + orc_warn_current("stack going in the wrong direction? at %pB\n", + (void *)orig_ip); goto err; } @@ -588,17 +627,20 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, memset(state, 0, sizeof(*state)); state->task = task; + if (!orc_init) + goto err; + /* * Refuse to unwind the stack of a task while it's executing on another * CPU. This check is racy, but that's ok: the unwinder has other * checks to prevent it from going off the rails. */ if (task_on_another_cpu(task)) - goto done; + goto err; if (regs) { if (user_mode(regs)) - goto done; + goto the_end; state->ip = regs->ip; state->sp = regs->sp; @@ -631,6 +673,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, * generate some kind of backtrace if this happens. */ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + state->error = true; if (get_stack_info(next_page, state->task, &state->stack_info, &state->stack_mask)) return; @@ -651,13 +694,14 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Otherwise, skip ahead to the user-specified starting frame: */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->sp <= (unsigned long)first_frame)) + state->sp < (unsigned long)first_frame)) unwind_next_frame(state); return; -done: +err: + state->error = true; +the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; - return; } EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 47a8676c7395..764573de3996 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -171,7 +171,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_t *pte; int i; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; @@ -197,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) } pte_unmap_unlock(pte, ptl); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1bf7e312361f..b4c6b6f35548 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -134,7 +134,6 @@ SECTIONS KPROBES_TEXT ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT - IRQENTRY_TEXT ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 85f1a90c55cd..123f1c1f1788 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = { .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, - .trap_init = x86_init_noop, .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init }, diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a789759b7261..4a3081e9f4b5 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,6 +3,10 @@ ccflags-y += -Iarch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror +ifeq ($(CONFIG_FRAME_POINTER),y) +OBJECT_FILES_NON_STANDARD_vmenter.o := y +endif + KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ diff --git 
a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 901cd1fdecd9..253b8e875ccd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -86,12 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - vcpu->arch.guest_xstate_size = best->ebx = - xstate_required_size(vcpu->arch.xcr0, false); + best->ebx = xstate_required_size(vcpu->arch.xcr0, false); } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); @@ -124,8 +122,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) MSR_IA32_MISC_ENABLE_MWAIT); } - /* Update physical-address width */ + /* Note, maxphyaddr must be updated before tdp_level. */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); @@ -297,7 +296,7 @@ void kvm_set_cpu_caps(void) F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | @@ -712,7 +711,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | - (1 << KVM_FEATURE_PV_SCHED_YIELD); + (1 << KVM_FEATURE_PV_SCHED_YIELD) | + (1 << KVM_FEATURE_ASYNC_PF_INT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); @@ -728,6 +728,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; + case 0x80000006: + /* L2 cache and TLB: pass through host info. 
*/ + break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ entry->edx &= (1 << 8); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 63a70f6a3df3..05434cd9342f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -303,4 +303,9 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bddaba9c68dd..de5476f8683e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5798,6 +5798,8 @@ writeback: } ctxt->eip = ctxt->_eip; + if (ctxt->mode != X86EMUL_MODE_PROT64) + ctxt->eip = (u32)ctxt->_eip; done: if (rc == X86EMUL_PROPAGATE_FAULT) { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index bcefa9d4e57e..af9cdb426dd2 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -21,6 +21,7 @@ #include "x86.h" #include "lapic.h" #include "ioapic.h" +#include "cpuid.h" #include "hyperv.h" #include <linux/cpu.h> @@ -266,6 +267,123 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } +static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, + HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, + 0); + if (!entry) + return false; + + return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; +} + +static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) + hv->hv_syndbg.control.status = + vcpu->run->hyperv.u.syndbg.status; + return 1; +} + +static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; + hv_vcpu->exit.u.syndbg.msr = msr; + hv_vcpu->exit.u.syndbg.control = syndbg->control.control; + hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; + hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; + hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; + vcpu->arch.complete_userspace_io = + kvm_hv_syndbg_complete_userspace; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data); + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + syndbg->control.control = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_STATUS: + syndbg->control.status = data; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + syndbg->control.send_page = data; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + syndbg->control.recv_page = data; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + syndbg->control.pending_page = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + syndbg->options = data; + break; + default: + break; + } + + return 0; +} + +static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) +{ + struct kvm_hv_syndbg *syndbg = 
vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + *pdata = syndbg->control.control; + break; + case HV_X64_MSR_SYNDBG_STATUS: + *pdata = syndbg->control.status; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + *pdata = syndbg->control.send_page; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + *pdata = syndbg->control.recv_page; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + *pdata = syndbg->control.pending_page; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + *pdata = syndbg->options; + break; + default: + break; + } + + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, + *pdata); + + return 0; +} + static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { @@ -800,6 +918,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } @@ -900,7 +1020,7 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, - HV_REFERENCE_TSC_PAGE *tsc_ref) + struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; @@ -941,7 +1061,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); - BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); + BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; @@ -1061,6 +1181,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_set_msr(vcpu, msr, data, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1129,7 +1252,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ - if (__clear_user((void __user *)addr, sizeof(u32))) + if (__put_user(0, (u32 __user *)addr)) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); @@ -1190,7 +1313,8 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 0; } -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; @@ -1227,6 +1351,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_get_msr(vcpu, msr, pdata, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1316,7 +1443,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) int r; mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else @@ -1425,9 +1552,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, - KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, - vcpu_mask, &hv_vcpu->tlb_flush); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, + NULL, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: /* We always do full TLB flush, set rep_done = rep_cnt. */ @@ -1530,7 +1656,7 @@ ret_success: bool kvm_hv_hypercall_enabled(struct kvm *kvm) { - return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; + return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) @@ -1709,6 +1835,34 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); break; + case HVCALL_POST_DEBUG_DATA: + case HVCALL_RETRIEVE_DEBUG_DATA: + if (unlikely(fast)) { + ret = HV_STATUS_INVALID_PARAMETER; + break; + } + fallthrough; + case HVCALL_RESET_DEBUG_SESSION: { + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu)) { + ret = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) { + ret = HV_STATUS_OPERATION_DENIED; + break; + } + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = param; + vcpu->run->hyperv.u.hcall.params[0] = ingpa; + vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->arch.complete_userspace_io = + kvm_hv_hypercall_complete_userspace; + return 0; + } default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -1796,12 +1950,15 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, + { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); - if (kvm_x86_ops.nested_get_evmcs_version) - evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu); + if (kvm_x86_ops.nested_ops->get_evmcs_version) + evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); /* Skip NESTED_FEATURES if eVMCS is not supported */ if (!evmcs_ver) @@ -1821,7 +1978,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); - ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; @@ -1860,6 +2017,10 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= 
HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + ent->ebx |= HV_DEBUGGING; + ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; + ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + /* * Direct Synthetic timers only make sense with in-kernel * LAPIC @@ -1903,6 +2064,24 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; + case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = 0; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_SYNDBG_INTERFACE: + memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + + case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: + ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + break; + default: break; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 757cb578101c..e68c6c2e9649 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -23,6 +23,33 @@ #include <linux/kvm_host.h> +/* + * The #defines related to the synthetic debugger are required by KDNet, but + * they are not documented in the Hyper-V TLFS because the synthetic debugger + * functionality has been deprecated and is subject to removal in future + * versions of Windows. + */ +#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 +#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 +#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 + +/* + * Hyper-V synthetic debugger platform capabilities + * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits. + */ +#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1) + +/* Hyper-V Synthetic debug options MSR */ +#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 +#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 +#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 +#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 +#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 +#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF + +/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */ +#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2) + static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) { return &vcpu->arch.hyperv; @@ -46,6 +73,11 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); } +static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu) +{ + return &vcpu->kvm->arch.hyperv.hv_syndbg; +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 750ff0b29404..d057376bd3d3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -225,12 +225,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, } /* - * AMD SVM AVIC accelerate EOI write and do not trap, - * in-kernel IOAPIC will not be able to receive the EOI. - * In this case, we do lazy update of the pending EOI when - * trying to set IOAPIC irq. + * AMD SVM AVIC accelerate EOI write iff the interrupt is edge + * triggered, in which case the in-kernel IOAPIC will not be able + * to receive the EOI. In this case, we do a lazy update of the + * pending EOI when trying to set IOAPIC irq. 
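Per the comment above, AVIC accelerates EOI writes only for edge-triggered interrupts, so the in-kernel IOAPIC never sees those EOIs; the hunk that follows therefore gates the lazy pending-EOI update on the trigger mode. A hedged restatement of that decision (the function and parameter names here are illustrative, not KVM's):

#include <stdbool.h>
#include <stdio.h>

/*
 * Level-triggered interrupts still trap on EOI, so only edge-triggered
 * ones need the eager ("lazy update") clearing of pending EOI state
 * while APICv/AVIC acceleration is active.
 */
static bool need_lazy_eoi_update(bool edge_triggered, bool apicv_active)
{
	return edge_triggered && apicv_active;
}

int main(void)
{
	printf("edge,  avic: %d\n", need_lazy_eoi_update(true, true));	/* 1 */
	printf("level, avic: %d\n", need_lazy_eoi_update(false, true));	/* 0 */
	return 0;
}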
*/ - if (kvm_apicv_activated(ioapic->kvm)) + if (edge && kvm_apicv_activated(ioapic->kvm)) ioapic_lazy_update_eoi(ioapic, irq); /* diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 2fb2e3c80724..660401700075 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -3,8 +3,8 @@ #define __KVM_IO_APIC_H #include <linux/kvm_host.h> - #include <kvm/iodev.h> +#include "irq.h" struct kvm; struct kvm_vcpu; @@ -108,11 +108,7 @@ do { \ static inline int ioapic_in_kernel(struct kvm *kvm) { - int mode = kvm->arch.irqchip_mode; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); - return mode == KVM_IRQCHIP_KERNEL; + return irqchip_kernel(kvm); } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index e330e7d125f7..99d118ffc67d 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -83,6 +83,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } +EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr); /* * check if there is pending interrupt without @@ -159,6 +160,8 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); + if (kvm_x86_ops.migrate_timers) + kvm_x86_ops.migrate_timers(vcpu); } bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f173ab6b407e..9b64abf9b3f1 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -16,7 +16,6 @@ #include <linux/spinlock.h> #include <kvm/iodev.h> -#include "ioapic.h" #include "lapic.h" #define PIC_NUM_PINS 16 @@ -66,15 +65,6 @@ void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -static inline int pic_in_kernel(struct kvm *kvm) -{ - int mode = kvm->arch.irqchip_mode; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); - return mode == KVM_IRQCHIP_KERNEL; -} - static inline int irqchip_split(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; @@ -93,6 +83,11 @@ static inline int irqchip_kernel(struct kvm *kvm) return mode == KVM_IRQCHIP_KERNEL; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + return irqchip_kernel(kvm); +} + static inline int irqchip_in_kernel(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 62558b9bdda7..ff2d0e9ca3bc 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -116,8 +116,9 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; - if (tmask & vcpu->arch.cr0_guest_owned_bits) - kvm_x86_ops.decache_cr0_guest_bits(vcpu); + if ((tmask & vcpu->arch.cr0_guest_owned_bits) && + !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } @@ -129,8 +130,9 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; - if (tmask & vcpu->arch.cr4_guest_owned_bits) - kvm_x86_ops.decache_cr4_guest_bits(vcpu); + if ((tmask & vcpu->arch.cr4_guest_owned_bits) && + !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } diff --git a/arch/x86/kvm/lapic.c 
b/arch/x86/kvm/lapic.c index 9af25c97612a..34a7e0533dad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -36,6 +36,7 @@ #include <linux/jump_label.h> #include "kvm_cache_regs.h" #include "irq.h" +#include "ioapic.h" #include "trace.h" #include "x86.h" #include "cpuid.h" @@ -110,11 +111,18 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) return apic->vcpu->vcpu_id; } -bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) +static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) { return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); } -EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt); + +bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops.set_hv_timer + && !(kvm_mwait_in_guest(vcpu->kvm) || + kvm_can_post_timer_interrupt(vcpu)); +} +EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer); static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) { @@ -1593,7 +1601,7 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) } } -static void apic_timer_expired(struct kvm_lapic *apic) +static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) { struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_timer *ktimer = &apic->lapic_timer; @@ -1604,6 +1612,12 @@ static void apic_timer_expired(struct kvm_lapic *apic) if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; + if (!from_timer_fn && vcpu->arch.apicv_active) { + WARN_ON(kvm_get_running_vcpu() != vcpu); + kvm_apic_inject_pending_timer_irqs(apic); + return; + } + if (kvm_use_posted_timer_interrupt(apic->vcpu)) { if (apic->lapic_timer.timer_advance_ns) __kvm_wait_lapic_expire(vcpu); @@ -1643,18 +1657,23 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD); } else - apic_timer_expired(apic); + apic_timer_expired(apic, false); local_irq_restore(flags); } +static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) +{ + return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count; +} + static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) { ktime_t now, remaining; u64 ns_remaining_old, ns_remaining_new; - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); limit_periodic_timer_frequency(apic); now = ktime_get(); @@ -1672,14 +1691,15 @@ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); } -static bool set_target_expiration(struct kvm_lapic *apic) +static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) { ktime_t now; u64 tscl = rdtsc(); + s64 deadline; now = ktime_get(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; + apic->lapic_timer.period = + tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); if (!apic->lapic_timer.period) { apic->lapic_timer.tscdeadline = 0; @@ -1687,10 +1707,32 @@ static bool set_target_expiration(struct kvm_lapic *apic) } limit_periodic_timer_frequency(apic); + deadline = apic->lapic_timer.period; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + if (unlikely(count_reg != APIC_TMICT)) { + deadline = tmict_to_ns(apic, + kvm_lapic_get_reg(apic, count_reg)); + if (unlikely(deadline <= 0)) + 
deadline = apic->lapic_timer.period; + else if (unlikely(deadline > apic->lapic_timer.period)) { + pr_info_ratelimited( + "kvm: vcpu %i: requested lapic timer restore with " + "starting count register %#x=%u (%lld ns) > initial count (%lld ns). " + "Using initial count to start timer.\n", + apic->vcpu->vcpu_id, + count_reg, + kvm_lapic_get_reg(apic, count_reg), + deadline, apic->lapic_timer.period); + kvm_lapic_set_reg(apic, count_reg, 0); + deadline = apic->lapic_timer.period; + } + } + } apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + - nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); - apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + nsec_to_cycles(apic->vcpu, deadline); + apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline); return true; } @@ -1723,7 +1765,7 @@ static void start_sw_period(struct kvm_lapic *apic) if (ktime_after(ktime_get(), apic->lapic_timer.target_expiration)) { - apic_timer_expired(apic); + apic_timer_expired(apic, false); if (apic_lvtt_oneshot(apic)) return; @@ -1760,7 +1802,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) bool expired; WARN_ON(preemptible()); - if (!kvm_x86_ops.set_hv_timer) + if (!kvm_can_use_hv_timer(vcpu)) return false; if (!ktimer->tscdeadline) @@ -1785,7 +1827,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (atomic_read(&ktimer->pending)) { cancel_hv_timer(apic); } else if (expired) { - apic_timer_expired(apic); + apic_timer_expired(apic, false); cancel_hv_timer(apic); } } @@ -1833,9 +1875,9 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; - WARN_ON(swait_active(&vcpu->wq)); + WARN_ON(rcuwait_active(&vcpu->wait)); cancel_hv_timer(apic); - apic_timer_expired(apic); + apic_timer_expired(apic, false); if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); @@ -1872,17 +1914,22 @@ void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) restart_apic_timer(apic); } -static void start_apic_timer(struct kvm_lapic *apic) +static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg) { atomic_set(&apic->lapic_timer.pending, 0); if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - && !set_target_expiration(apic)) + && !set_target_expiration(apic, count_reg)) return; restart_apic_timer(apic); } +static void start_apic_timer(struct kvm_lapic *apic) +{ + __start_apic_timer(apic, APIC_TMICT); +} + static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); @@ -2336,7 +2383,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); - apic_timer_expired(apic); + apic_timer_expired(apic, true); if (lapic_is_periodic(apic)) { advance_periodic_target_expiration(apic); @@ -2493,6 +2540,14 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + + /* + * Get calculated timer current count for remaining timer period (if + * any) and store it in the returned register set. 
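The restore path above converts a saved current-count register to nanoseconds with tmict_to_ns() and falls back to the full period when the saved value is zero or exceeds the initial count. A standalone sketch of that clamping arithmetic; the bus-cycle constant and the divider are plain inputs here, with APIC_BUS_CYCLE_NS set to 1 purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define APIC_BUS_CYCLE_NS 1	/* illustrative bus-cycle length in ns */

static uint64_t tmict_to_ns(uint32_t count, uint32_t divide_count)
{
	return (uint64_t)count * APIC_BUS_CYCLE_NS * divide_count;
}

/* Pick the restore deadline: the remaining count if plausible, else the period. */
static int64_t restore_deadline(uint32_t tmict, uint32_t tmcct, uint32_t div)
{
	int64_t period = tmict_to_ns(tmict, div);
	int64_t deadline = tmict_to_ns(tmcct, div);

	if (deadline <= 0 || deadline > period)
		deadline = period;	/* saved count is bogus; restart a full period */
	return deadline;
}

int main(void)
{
	/* 1000-tick initial count, 250 ticks remaining, divider 16. */
	printf("%lld ns left\n", (long long)restore_deadline(1000, 250, 16));
	return 0;
}

Here restore_deadline(1000, 250, 16) yields 4000 ns, the remaining quarter of a 16000 ns period.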
+ */ + __kvm_lapic_set_reg(s->regs, APIC_TMCCT, + __apic_read(vcpu->arch.apic, APIC_TMCCT)); + return kvm_apic_state_fixup(vcpu, s, false); } @@ -2520,7 +2575,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); - start_apic_timer(apic); + __start_apic_timer(apic, APIC_TMCCT); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a0ffb4331418..754f29beb83e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -161,9 +161,14 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off) return *((u32 *) (apic->regs + reg_off)); } +static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) +{ + *((u32 *) (regs + reg_off)) = val; +} + static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { - *((u32 *) (apic->regs + reg_off)) = val; + __kvm_lapic_set_reg(apic->regs, reg_off, val); } extern struct static_key kvm_no_apic_vcpu; @@ -245,7 +250,7 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); -bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu); +bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu); static inline enum lapic_mode kvm_apic_mode(u64 apic_base) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 8a3b1bce722a..0ad06bfe2c2c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,13 +51,13 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8071952e9cf2..fdd05c233308 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -16,6 +16,7 @@ */ #include "irq.h" +#include "ioapic.h" #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" @@ -78,6 +79,9 @@ module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); +static bool __read_mostly force_flush_and_sync_on_reuse; +module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -244,7 +248,6 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_mmio_access_mask; static u64 __read_mostly shadow_present_mask; @@ -331,19 +334,19 @@ static 
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); - BUG_ON((mmio_mask & mmio_value) != mmio_value); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); static bool is_mmio_spte(u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_value; + return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) @@ -566,7 +569,6 @@ static void kvm_mmu_reset_all_pte_masks(void) shadow_dirty_mask = 0; shadow_nx_mask = 0; shadow_x_mask = 0; - shadow_mmio_mask = 0; shadow_present_mask = 0; shadow_acc_track_mask = 0; @@ -583,16 +585,15 @@ static void kvm_mmu_reset_all_pte_masks(void) * the most significant bits of legal physical address space. */ shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_cache_bits; - if (boot_cpu_data.x86_cache_bits < - 52 - shadow_nonpresent_or_rsvd_mask_len) { + low_phys_bits = boot_cpu_data.x86_phys_bits; + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; shadow_nonpresent_or_rsvd_mask = - rsvd_bits(boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len, - boot_cpu_data.x86_cache_bits - 1); - low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; - } else - WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF)); + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); + } shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); @@ -620,7 +621,7 @@ static int is_large_pte(u64 pte) static int is_last_spte(u64 pte, int level) { - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) return 1; if (is_large_pte(pte)) return 1; @@ -1196,7 +1197,7 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, struct kvm_lpage_info *linfo; int i; - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->disallow_lpage += count; WARN_ON(linfo->disallow_lpage < 0); @@ -1225,7 +1226,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slot = __gfn_to_memslot(slots, gfn); /* the non-leaf shadow pages are keeping readonly. 
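The reworked L1TF masking above only reserves high SPTE bits when the CPU is actually affected and there is room below bit 52; rsvd_bits(s, e), declared in mmu.h, builds a mask with bits s through e inclusive. A quick standalone check of that mask arithmetic, using made-up CPU geometry:

#include <stdint.h>
#include <stdio.h>

/* Mask with bits s..e (inclusive) set, as in the kernel's rsvd_bits(). */
static uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	int cache_bits = 46, mask_len = 5;	/* illustrative geometry only */
	int low = cache_bits - mask_len;

	/* Bits 41..45 become the "nonpresent or reserved" marker range. */
	printf("mask = %#llx\n",
	       (unsigned long long)rsvd_bits(low, cache_bits - 1));
	return 0;
}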
*/ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); @@ -1253,7 +1254,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); @@ -1398,7 +1399,7 @@ static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; + return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, @@ -1529,8 +1530,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - WARN_ON(page_header(__pa(sptep))->role.level == - PT_PAGE_TABLE_LEVEL); + WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); --kvm->stat.lpages; return true; @@ -1682,7 +1682,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1708,7 +1708,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ @@ -1760,7 +1760,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int i; bool write_protected = false; - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } @@ -1948,8 +1948,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_start = hva_to_gfn_memslot(hva_start, memslot); gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, + for_each_slot_rmap_range(memslot, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, &iterator) ret |= handler(kvm, iterator.rmap, memslot, @@ -2153,10 +2153,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) -{ -} - static void nonpaging_update_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) @@ -2313,7 +2309,7 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, return; if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } #ifdef CONFIG_KVM_MMU_AUDIT @@ -2347,7 +2343,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, if (!s->unsync) continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(s->role.level != PG_LEVEL_4K); ret |= kvm_sync_page(vcpu, s, invalid_list); } @@ -2376,7 +2372,7 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec, int level = sp->role.level; parents->idx[level-1] = idx; - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) break; parents->parent[level-2] = sp; @@ -2398,7 
+2394,7 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, sp = pvec->page[0].sp; level = sp->role.level; - WARN_ON(level == PT_PAGE_TABLE_LEVEL); + WARN_ON(level == PG_LEVEL_4K); parents->parent[level-2] = sp; @@ -2520,11 +2516,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } if (sp->unsync_children) - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); @@ -2546,11 +2542,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * be inconsistent with guest page table. */ account_shadowed(vcpu->kvm, sp); - if (level == PT_PAGE_TABLE_LEVEL && - rmap_write_protect(vcpu, gfn)) + if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); - if (level > PT_PAGE_TABLE_LEVEL && need_sync) + if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } clear_page(sp->spt); @@ -2601,7 +2596,7 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { - if (iterator->level < PT_PAGE_TABLE_LEVEL) + if (iterator->level < PG_LEVEL_4K) return false; iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); @@ -2722,7 +2717,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct mmu_page_path parents; struct kvm_mmu_pages pages; - if (parent->role.level == PT_PAGE_TABLE_LEVEL) + if (parent->role.level == PG_LEVEL_4K) return 0; while (mmu_unsync_walk(parent, &pages)) { @@ -2921,7 +2916,7 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, if (sp->unsync) continue; - WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } @@ -3020,7 +3015,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); - if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled()) { pte_access &= ~ACC_EXEC_MASK; } @@ -3033,7 +3028,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; - if (level > PT_PAGE_TABLE_LEVEL) + if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, @@ -3103,8 +3098,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. 
*/ - if (level > PT_PAGE_TABLE_LEVEL && - !is_large_pte(*sptep)) { + if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; @@ -3125,7 +3119,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { if (write_fault) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) @@ -3228,7 +3222,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp_ad_disabled(sp)) return; - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return; __direct_pte_prefetch(vcpu, sp, sptep); @@ -3241,12 +3235,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, pte_t *pte; int level; - BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K || - PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || - PT_PDPE_LEVEL != (int)PG_LEVEL_1G); - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() @@ -3260,7 +3250,7 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); if (unlikely(!pte)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; return level; } @@ -3274,28 +3264,28 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; - if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) - return PT_PAGE_TABLE_LEVEL; + if (unlikely(max_level == PG_LEVEL_4K)) + return PG_LEVEL_4K; if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); if (!slot) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; max_level = min(max_level, max_page_level); - for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) break; } - if (max_level == PT_PAGE_TABLE_LEVEL) - return PT_PAGE_TABLE_LEVEL; + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) return level; level = min(level, max_level); @@ -3317,7 +3307,7 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, int level = *levelp; u64 spte = *it.sptep; - if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + if (it.level == level && level > PG_LEVEL_4K && is_nx_huge_page_enabled() && is_shadow_present_pte(spte) && !is_large_pte(spte)) { @@ -3574,7 +3564,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) break; } @@ -3586,7 +3576,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virt/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.rst to get more detail. 
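kvm_mmu_hugepage_adjust() above clamps the mapping level in two stages: walk the candidate level down until a level is found that no lpage_info entry disallows, then cap it by the level the host mapping actually supports. A condensed sketch of that clamp, with a hard-coded stand-in for lpage_info_slot():

#include <stdio.h>

enum { PG_LEVEL_4K = 1, PG_LEVEL_2M, PG_LEVEL_1G,
       MAX_HUGEPAGE_LEVEL = PG_LEVEL_1G };

/* Stand-in for lpage_info_slot(): whether gfn may map hugely at level. */
static int disallow_lpage(unsigned long gfn, int level)
{
	(void)gfn;
	return level == PG_LEVEL_1G;	/* pretend 1G is disallowed here */
}

static int hugepage_adjust(unsigned long gfn, int max_level, int host_level)
{
	/* Stage 1: lower max_level until nothing disallows it. */
	for (; max_level > PG_LEVEL_4K; max_level--)
		if (!disallow_lpage(gfn, max_level))
			break;
	/* Stage 2: cap by what the host mapping supports. */
	if (max_level == PG_LEVEL_4K || host_level == PG_LEVEL_4K)
		return PG_LEVEL_4K;
	return host_level < max_level ? host_level : max_level;
}

int main(void)
{
	printf("level = %d\n",
	       hugepage_adjust(0x1000, MAX_HUGEPAGE_LEVEL, PG_LEVEL_2M));
	return 0;	/* prints level = 2 (PG_LEVEL_2M) */
}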
*/ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, @@ -3666,7 +3656,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, &invalid_list); mmu->root_hpa = INVALID_PAGE; } - mmu->root_cr3 = 0; + mmu->root_pgd = 0; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3686,58 +3676,64 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, + u8 level, bool direct) { struct kvm_mmu_page *sp; + + spin_lock(&vcpu->kvm->mmu_lock); + + if (make_mmu_pages_available(vcpu)) { + spin_unlock(&vcpu->kvm->mmu_lock); + return INVALID_PAGE; + } + sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + ++sp->root_count; + + spin_unlock(&vcpu->kvm->mmu_lock); + return __pa(sp->spt); +} + +static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) +{ + u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; + hpa_t root; unsigned i; - if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { - spin_lock(&vcpu->kvm->mmu_lock); - if(make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); + if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (!VALID_PAGE(root)) return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, 0, 0, - vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu->root_hpa = __pa(sp->spt); - } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) { + vcpu->arch.mmu->root_hpa = root; + } else if (shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu->pae_root[i]; + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); - MMU_WARN_ON(VALID_PAGE(root)); - spin_lock(&vcpu->kvm->mmu_lock); - if (make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), + i << 30, PT32_ROOT_LEVEL, true); + if (!VALID_PAGE(root)) return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); } else BUG(); - /* root_cr3 is ignored for direct MMUs. */ - vcpu->arch.mmu->root_cr3 = 0; + /* root_pgd is ignored for direct MMUs. */ + vcpu->arch.mmu->root_pgd = 0; return 0; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { - struct kvm_mmu_page *sp; u64 pdptr, pm_mask; - gfn_t root_gfn, root_cr3; + gfn_t root_gfn, root_pgd; + hpa_t root; int i; - root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); - root_gfn = root_cr3 >> PAGE_SHIFT; + root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); + root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; @@ -3747,22 +3743,14 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * write-protect the guests page table root. 
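The new mmu_alloc_root() above hoists the lock/make_mmu_pages_available()/refcount sequence out of every caller and signals failure with INVALID_PAGE, so each call site shrinks to a call plus a VALID_PAGE() check. A loose sketch of the same consolidation, with a pthread mutex and a toy allocator standing in for mmu_lock and the real shadow-page cache:

#include <pthread.h>
#include <stdio.h>

#define INVALID_PAGE 0UL

static pthread_mutex_t mmu_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long next_pa = 0x1000;
static int pages_available = 1;		/* toy "can we allocate?" state */

/* Allocate one root page under the lock; INVALID_PAGE signals failure. */
static unsigned long mmu_alloc_root(void)
{
	unsigned long root = INVALID_PAGE;

	pthread_mutex_lock(&mmu_lock);
	if (pages_available) {
		root = next_pa;
		next_pa += 0x1000;
	}
	pthread_mutex_unlock(&mmu_lock);
	return root;
}

int main(void)
{
	unsigned long root = mmu_alloc_root();

	if (root == INVALID_PAGE)
		return 1;	/* the kernel caller maps this to -ENOSPC */
	printf("root = %#lx\n", root);
	return 0;
}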
*/ if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu->root_hpa; - - MMU_WARN_ON(VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); - spin_lock(&vcpu->kvm->mmu_lock); - if (make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); + root = mmu_alloc_root(vcpu, root_gfn, 0, + vcpu->arch.mmu->shadow_root_level, false); + if (!VALID_PAGE(root)) return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu->root_hpa = root; - goto set_root_cr3; + goto set_root_pgd; } /* @@ -3775,9 +3763,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu->pae_root[i]; - - MMU_WARN_ON(VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); if (!(pdptr & PT_PRESENT_MASK)) { @@ -3788,17 +3774,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu_check_root(vcpu, root_gfn)) return 1; } - spin_lock(&vcpu->kvm->mmu_lock); - if (make_mmu_pages_available(vcpu) < 0) { - spin_unlock(&vcpu->kvm->mmu_lock); - return -ENOSPC; - } - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, - 0, ACC_ALL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); + root = mmu_alloc_root(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, false); + if (!VALID_PAGE(root)) + return -ENOSPC; vcpu->arch.mmu->pae_root[i] = root | pm_mask; } vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); @@ -3828,8 +3808,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); } -set_root_cr3: - vcpu->arch.mmu->root_cr3 = root_cr3; +set_root_pgd: + vcpu->arch.mmu->root_pgd = root_pgd; return 0; } @@ -4083,18 +4063,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable) { - struct kvm_memory_slot *slot; + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; - /* - * Don't expose private memslots to L2. - */ - if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { + /* Don't expose private memslots to L2. */ + if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { *pfn = KVM_PFN_NOSLOT; + *writable = false; return false; } - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); async = false; *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) @@ -4135,7 +4113,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return r; if (lpage_disallowed) - max_level = PT_PAGE_TABLE_LEVEL; + max_level = PG_LEVEL_4K; if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; @@ -4171,7 +4149,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, /* This path builds a PAE pagetable, we can map 2mb pages at maximum. 
*/ return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, - PT_DIRECTORY_LEVEL, false); + PG_LEVEL_2M, false); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, @@ -4186,7 +4164,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_reason) { + switch (vcpu->arch.apf.host_apf_flags) { default: trace_kvm_page_fault(fault_address, error_code); @@ -4196,13 +4174,13 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, insn_len); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: - vcpu->arch.apf.host_apf_reason = 0; + vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); - kvm_async_pf_task_wait(fault_address, 0); + kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_reason = 0; + vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wake(fault_address); local_irq_enable(); @@ -4217,8 +4195,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, { int max_level; - for (max_level = PT_MAX_HUGEPAGE_LEVEL; - max_level > PT_PAGE_TABLE_LEVEL; + for (max_level = KVM_MAX_HUGEPAGE_LEVEL; + max_level > PG_LEVEL_4K; max_level--) { int page_num = KVM_PAGES_PER_HPAGE(max_level); gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); @@ -4237,7 +4215,7 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; + context->invlpg = NULL; context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; @@ -4245,51 +4223,50 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3, +static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { - return (role.direct || cr3 == root->cr3) && + return (role.direct || pgd == root->pgd) && VALID_PAGE(root->hpa) && page_header(root->hpa) && role.word == page_header(root->hpa)->role.word; } /* - * Find out if a previously cached root matching the new CR3/role is available. + * Find out if a previously cached root matching the new pgd/role is available. * The current root is also inserted into the cache. * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is * returned. * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and * false is returned. This root should now be freed by the caller. 
*/ -static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; struct kvm_mmu_root_info root; struct kvm_mmu *mmu = vcpu->arch.mmu; - root.cr3 = mmu->root_cr3; + root.pgd = mmu->root_pgd; root.hpa = mmu->root_hpa; - if (is_root_usable(&root, new_cr3, new_role)) + if (is_root_usable(&root, new_pgd, new_role)) return true; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { swap(root, mmu->prev_roots[i]); - if (is_root_usable(&root, new_cr3, new_role)) + if (is_root_usable(&root, new_pgd, new_role)) break; } mmu->root_hpa = root.hpa; - mmu->root_cr3 = root.cr3; + mmu->root_pgd = root.pgd; return i < KVM_MMU_NUM_PREV_ROOTS; } -static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role, - bool skip_tlb_flush) +static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, + union kvm_mmu_page_role new_role) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -4299,70 +4276,59 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, * later if necessary. */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - mmu->root_level >= PT64_ROOT_4LEVEL) { - if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) - return false; - - if (cached_root_available(vcpu, new_cr3, new_role)) { - /* - * It is possible that the cached previous root page is - * obsolete because of a change in the MMU generation - * number. However, changing the generation number is - * accompanied by KVM_REQ_MMU_RELOAD, which will free - * the root set here and allocate a new one. - */ - kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - if (!skip_tlb_flush) { - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - - /* - * The last MMIO access's GVA and GPA are cached in the - * VCPU. When switching to a new CR3, that GVA->GPA - * mapping may no longer be valid. So clear any cached - * MMIO info even when we don't need to sync the shadow - * page tables. - */ - vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - - __clear_sp_write_flooding_count( - page_header(mmu->root_hpa)); - - return true; - } - } + mmu->root_level >= PT64_ROOT_4LEVEL) + return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && + cached_root_available(vcpu, new_pgd, new_role); return false; } -static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, +static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, union kvm_mmu_page_role new_role, - bool skip_tlb_flush) + bool skip_tlb_flush, bool skip_mmu_sync) { - if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, - KVM_MMU_ROOT_CURRENT); + if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); + return; + } + + /* + * It's possible that the cached previous root page is obsolete because + * of a change in the MMU generation number. However, changing the + * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will + * free the root set here and allocate a new one. + */ + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); + + if (!skip_mmu_sync || force_flush_and_sync_on_reuse) + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + if (!skip_tlb_flush || force_flush_and_sync_on_reuse) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + + /* + * The last MMIO access's GVA and GPA are cached in the VCPU. When + * switching to a new CR3, that GVA->GPA mapping may no longer be + * valid. 
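cached_root_available() above walks prev_roots[] with swap(), which searches the cache and simultaneously rotates the displaced current root into it in LRU order. A compact model of that swap-walk; the role comparison and VALID_PAGE() checks from is_root_usable() are elided, and KVM_MMU_NUM_PREV_ROOTS is fixed at 3 for illustration:

#include <stdbool.h>
#include <stdio.h>

#define NUM_PREV_ROOTS 3

struct root_info { unsigned long pgd, hpa; };

static void swap_roots(struct root_info *a, struct root_info *b)
{
	struct root_info t = *a; *a = *b; *b = t;
}

/*
 * Search prev[] for new_pgd. The current root is pushed into the cache
 * as we walk, and on a hit the matching entry becomes current.
 */
static bool cached_root_available(struct root_info *cur,
				  struct root_info prev[NUM_PREV_ROOTS],
				  unsigned long new_pgd)
{
	int i;
	struct root_info root = *cur;

	if (root.pgd == new_pgd)
		return true;
	for (i = 0; i < NUM_PREV_ROOTS; i++) {
		swap_roots(&root, &prev[i]);
		if (root.pgd == new_pgd)
			break;
	}
	*cur = root;		/* hit: the match; miss: the LRU victim */
	return i < NUM_PREV_ROOTS;
}

int main(void)
{
	struct root_info cur = { .pgd = 0x1000, .hpa = 0xa000 };
	struct root_info prev[NUM_PREV_ROOTS] = {
		{ 0x2000, 0xb000 }, { 0x3000, 0xc000 }, { 0x4000, 0xd000 },
	};

	printf("hit=%d cur.pgd=%#lx\n",
	       cached_root_available(&cur, prev, 0x3000), cur.pgd);
	return 0;	/* prints hit=1 cur.pgd=0x3000 */
}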
So clear any cached MMIO info even when we don't need to sync + * the shadow page tables. + */ + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); + + __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, + bool skip_mmu_sync) { - __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), - skip_tlb_flush); + __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu), + skip_tlb_flush, skip_mmu_sync); } -EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); +EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static unsigned long get_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); } -static void inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - vcpu->arch.mmu->inject_page_fault(vcpu, fault); -} - static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access, int *nr_present) { @@ -4391,11 +4357,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, gpte &= level - mmu->last_nonleaf_level; /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + * PG_LEVEL_4K always terminates. The RHS has bit 7 set + * iff level <= PG_LEVEL_4K, which for our purpose means + * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + gpte |= level - PG_LEVEL_4K - 1; return gpte & PT_PAGE_SIZE_MASK; } @@ -4909,7 +4875,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = kvm_x86_ops.get_tdp_level(vcpu); + role.base.level = vcpu->arch.tdp_level; role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4928,9 +4894,9 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->mmu_role.as_u64 = new_role.as_u64; context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; + context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); + context->shadow_root_level = vcpu->arch.tdp_level; context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -4986,7 +4952,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) { struct kvm_mmu *context = vcpu->arch.mmu; union kvm_mmu_role new_role = @@ -4995,11 +4961,11 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) if (new_role.as_u64 == context->mmu_role.as_u64) return; - if (!is_paging(vcpu)) + if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); - else if (is_long_mode(vcpu)) + else if (efer & EFER_LMA) paging64_init_context(vcpu, context); - else if (is_pae(vcpu)) + else if (cr4 & X86_CR4_PAE) paging32E_init_context(vcpu, context); else paging32_init_context(vcpu, context); @@ -5047,7 +5013,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false); + __kvm_mmu_new_pgd(vcpu, new_eptp, 
new_role.base, true, true); if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -5077,7 +5043,11 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = vcpu->arch.mmu; - kvm_init_shadow_mmu(vcpu); + kvm_init_shadow_mmu(vcpu, + kvm_read_cr0_bits(vcpu, X86_CR0_PG), + kvm_read_cr4_bits(vcpu, X86_CR4_PAE), + vcpu->arch.efer); + context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; @@ -5097,6 +5067,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->inject_page_fault = kvm_inject_page_fault; /* + * L2 page tables are never shadowed, so there is no need to sync + * SPTEs. + */ + g_context->invlpg = NULL; + + /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using @@ -5183,7 +5159,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_pgd(vcpu); - kvm_x86_ops.tlb_flush(vcpu, true); + kvm_x86_ops.tlb_flush_current(vcpu); out: return r; } @@ -5202,7 +5178,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *new) { - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (sp->role.level != PG_LEVEL_4K) { ++vcpu->kvm->stat.mmu_pde_zapped; return; } @@ -5260,7 +5236,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ - if (sp->role.level == PT_PAGE_TABLE_LEVEL) + if (sp->role.level == PG_LEVEL_4K) return false; atomic_inc(&sp->write_flooding_count); @@ -5497,37 +5473,54 @@ emulate: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); -void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t gva, hpa_t root_hpa) { - struct kvm_mmu *mmu = vcpu->arch.mmu; int i; - /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(gva, vcpu)) + /* It's actually a GPA for vcpu->arch.guest_mmu. */ + if (mmu != &vcpu->arch.guest_mmu) { + /* INVLPG on a non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + + kvm_x86_ops.tlb_flush_gva(vcpu, gva); + } + + if (!mmu->invlpg) return; - mmu->invlpg(vcpu, gva, mmu->root_hpa); + if (root_hpa == INVALID_PAGE) { + mmu->invlpg(vcpu, gva, mmu->root_hpa); - /* - * INVLPG is required to invalidate any global mappings for the VA, - * irrespective of PCID. Since it would take us roughly similar amount - * of work to determine whether any of the prev_root mappings of the VA - * is marked global, or to just sync it blindly, so we might as well - * just always sync it. - * - * Mappings not reachable via the current cr3 or the prev_roots will be - * synced when switching to that cr3, so nothing needs to be done here - * for them. - */ - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (VALID_PAGE(mmu->prev_roots[i].hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Since it would take us roughly similar amount + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. 
+ * + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. + */ + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + } else { + mmu->invlpg(vcpu, gva, root_hpa); + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva); - kvm_x86_ops.tlb_flush_gva(vcpu, gva); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -5541,7 +5534,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); tlb_flush = true; } @@ -5574,9 +5567,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) if (tdp_enabled) max_page_level = tdp_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PT_PDPE_LEVEL; + max_page_level = PG_LEVEL_1G; else - max_page_level = PT_DIRECTORY_LEVEL; + max_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); @@ -5632,24 +5625,24 @@ static __always_inline bool slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } static __always_inline bool slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1, + KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_PAGE_TABLE_LEVEL, lock_flush_tlb); + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, + PG_LEVEL_4K, lock_flush_tlb); } static void free_mmu_pages(struct kvm_mmu *mmu) @@ -5672,7 +5665,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. 
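/*
 * [Illustrative sketch, not part of the diff] Core of the
 * kvm_mmu_invpcid_gva() loop above: only cached roots whose PCID matches
 * the INVPCID operand are asked to invalidate the GVA.  A root's PCID
 * lives in the low bits of its CR3 value; kvm_get_pcid() is modeled as a
 * plain mask here, while the real helper also honors CR4.PCIDE.
 */
#include <stdbool.h>
#include <stdint.h>

#define PCID_MASK	0xfffULL		/* CR3 bits 11:0 */
#define INVALID_HPA	(~0ULL)

struct cached_root { uint64_t pgd; uint64_t hpa; };

static bool invpcid_one_gva(const struct cached_root *roots, unsigned int n,
			    uint64_t gva, uint64_t pcid,
			    void (*invlpg)(uint64_t gva, uint64_t root_hpa))
{
	bool flushed = false;
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (roots[i].hpa != INVALID_HPA &&
		    pcid == (roots[i].pgd & PCID_MASK)) {
			invlpg(gva, roots[i].hpa);
			flushed = true;	/* caller then requests a TLB flush */
		}
	}
	return flushed;
}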
*/ - if (tdp_enabled && kvm_x86_ops.get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); @@ -5695,13 +5688,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_cr3 = 0; + vcpu->arch.root_mmu.root_pgd = 0; vcpu->arch.root_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.guest_mmu.root_cr3 = 0; + vcpu->arch.guest_mmu.root_pgd = 0; vcpu->arch.guest_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; @@ -5859,7 +5852,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true); } } @@ -5881,7 +5875,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, PT_MAX_HUGEPAGE_LEVEL, false); + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); spin_unlock(&kvm->mmu_lock); /* @@ -6142,27 +6136,18 @@ static void kvm_set_mmio_spte_mask(void) u64 mask; /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; - /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. - */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. 
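/*
 * [Worked example, not part of the diff] The rewritten MMIO mask logic
 * above, runnable in isolation.  With fewer than 52 implemented physical
 * address bits, PA bit 51 is reserved in every 64-bit PTE, so setting it
 * together with the present bit guarantees a #PF with PFEC.RSVD=1 that KVM
 * can attribute to MMIO.  On a part that implements all 52 bits no PTE bit
 * is reserved and the trick must be turned off.
 */
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)	(1ULL << (n))		/* as in linux/bits.h */
#define PT_PRESENT_MASK	BIT_ULL(0)

static uint64_t mmio_spte_mask(unsigned int shadow_phys_bits)
{
	return shadow_phys_bits < 52 ? BIT_ULL(51) | PT_PRESENT_MASK : 0;
}

int main(void)
{
	/* Prints 0x8000000000000001 for 46 PA bits, 0 for 52 PA bits. */
	printf("%#llx\n", (unsigned long long)mmio_spte_mask(46));
	printf("%#llx\n", (unsigned long long)mmio_spte_mask(52));
	return 0;
}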
- */ - if (shadow_phys_bits == 52) - mask &= ~1ull; - - kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK); } static bool get_nx_auto_mode(void) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index ddc1ec3bdacd..a7bcde34d1f2 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -61,7 +61,7 @@ static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, { int index, val; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); val = slot->arch.gfn_track[mode][index]; @@ -151,7 +151,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return false; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 9bdf9b7d9a96..a6d484ea110b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -75,7 +75,7 @@ #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) -#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) +#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K) /* * The guest_walker structure emulates the behavior of the hardware page @@ -165,22 +165,22 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long pfn; unsigned long paddr; - down_read(&current->mm->mmap_sem); + mmap_read_lock(current->mm); vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); if (!vma || !(vma->vm_flags & VM_PFNMAP)) { - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); return -EFAULT; } pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; paddr = pfn << PAGE_SHIFT; table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); if (!table) { - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); return -EFAULT; } ret = CMPXCHG(&table[index], orig_pte, new_pte); memunmap(table); - up_read(&current->mm->mmap_sem); + mmap_read_unlock(current->mm); } return (ret != orig_pte); @@ -198,7 +198,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; - if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K)) goto no_present; return false; @@ -436,7 +436,7 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) + if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); @@ -552,7 +552,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. 
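/*
 * [Illustrative sketch, not part of the diff] The FNAME(cmpxchg_gpte) hunk
 * above is a mechanical conversion to the new mmap locking API: callers
 * stop naming the lock member directly, so its implementation can change
 * without touching them.  A userspace analogue, with a pthread rwlock and
 * the _demo names standing in for the kernel's rw_semaphore and helpers:
 */
#include <pthread.h>

struct mm_like { pthread_rwlock_t mmap_lock; };

static inline void mmap_read_lock_demo(struct mm_like *mm)
{
	pthread_rwlock_rdlock(&mm->mmap_lock);	/* was down_read(&mm->mmap_sem) */
}

static inline void mmap_read_unlock_demo(struct mm_like *mm)
{
	pthread_rwlock_unlock(&mm->mmap_lock);	/* was up_read(&mm->mmap_sem) */
}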
*/ - mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -575,7 +575,7 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, u64 mask; int r, index; - if (level == PT_PAGE_TABLE_LEVEL) { + if (level == PG_LEVEL_4K) { mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); @@ -600,7 +600,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, sp = page_header(__pa(sptep)); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return; if (sp->role.direct) @@ -812,7 +812,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, if (!r) { pgprintk("%s: guest page fault\n", __func__); if (!prefault) - inject_page_fault(vcpu, &walker.fault); + kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } @@ -828,7 +828,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (lpage_disallowed || is_self_change_mapping) - max_level = PT_PAGE_TABLE_LEVEL; + max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -884,7 +884,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; @@ -1070,7 +1070,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PT_PAGE_TABLE_LEVEL, + pte_access, PG_LEVEL_4K, gfn, spte_to_pfn(sp->spt[i]), true, false, host_writable); } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ca39f62aabc6..9d2844f87f6d 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -100,7 +100,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) sp = page_header(__pa(sptep)); if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { + if (level != PG_LEVEL_4K) { audit_printk(vcpu->kvm, "unsync sp: %p " "level = %d\n", sp, level); return; @@ -176,7 +176,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) { int i; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) + if (sp->role.level != PG_LEVEL_4K) return; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); + rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index a5078841bdac..b86346903f2e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -397,9 +397,9 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) __set_bit(pmc->idx, pmu->pmc_in_use); } -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr, data); + return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data 
*msr_info) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a6c78a797cb1..ab85eed8a6cc 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -32,7 +32,7 @@ struct kvm_pmu_ops { struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void (*refresh)(struct kvm_vcpu *vcpu); void (*init)(struct kvm_vcpu *vcpu); @@ -147,7 +147,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 90a1ca939627..8a6db11dcb43 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -19,11 +19,14 @@ #include <linux/kernel.h> #include <asm/msr-index.h> +#include <asm/debugreg.h> #include "kvm_emulate.h" #include "trace.h" #include "mmu.h" #include "x86.h" +#include "cpuid.h" +#include "lapic.h" #include "svm.h" static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -58,7 +61,7 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) { struct vcpu_svm *svm = to_svm(vcpu); - u64 cr3 = svm->nested.nested_cr3; + u64 cr3 = svm->nested.ctl.nested_cr3; u64 pdpte; int ret; @@ -73,19 +76,22 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - return svm->nested.nested_cr3; + return svm->nested.ctl.nested_cr3; } static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *hsave = svm->nested.hsave; + WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_mmu(vcpu); + kvm_init_shadow_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); + vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level; reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -98,8 +104,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h; - struct nested_state *g; + struct vmcb_control_area *c, *h, *g; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -108,14 +113,16 @@ void recalc_intercepts(struct vcpu_svm *svm) c = &svm->vmcb->control; h = &svm->nested.hsave->control; - g = &svm->nested; + g = &svm->nested.ctl; + + svm->nested.host_intercept_exceptions = h->intercept_exceptions; c->intercept_cr = h->intercept_cr; c->intercept_dr = h->intercept_dr; c->intercept_exceptions = h->intercept_exceptions; c->intercept = 
h->intercept; - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want the cr8 intercept bits of L1 */ c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); @@ -137,11 +144,9 @@ void recalc_intercepts(struct vcpu_svm *svm) c->intercept |= g->intercept; } -static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +static void copy_vmcb_control_area(struct vmcb_control_area *dst, + struct vmcb_control_area *from) { - struct vmcb_control_area *dst = &dst_vmcb->control; - struct vmcb_control_area *from = &from_vmcb->control; - dst->intercept_cr = from->intercept_cr; dst->intercept_dr = from->intercept_dr; dst->intercept_exceptions = from->intercept_exceptions; @@ -149,7 +154,7 @@ static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; - dst->asid = from->asid; + /* asid not copied, it is handled manually for svm->vmcb. */ dst->tlb_ctl = from->tlb_ctl; dst->int_ctl = from->int_ctl; dst->int_vector = from->int_vector; @@ -178,7 +183,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -189,7 +194,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; - offset = svm->nested.vmcb_msrpm + (p * 4); + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) return false; @@ -202,41 +207,111 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } -static bool nested_vmcb_checks(struct vmcb *vmcb) +static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { - if ((vmcb->save.efer & EFER_SVME) == 0) + if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0) return false; - if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + if (control->asid == 0) return false; - if (vmcb->control.asid == 0) - return false; - - if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && + if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled) return false; return true; } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct kvm_host_map *map) +static bool nested_vmcb_checks(struct vmcb *vmcb) { - bool evaluate_pending_interrupts = - is_intercept(svm, INTERCEPT_VINTR) || - is_intercept(svm, INTERCEPT_IRET); + if ((vmcb->save.efer & EFER_SVME) == 0) + return false; - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) - svm->vcpu.arch.hflags |= HF_HIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + if (((vmcb->save.cr0 & X86_CR0_CD) == 0) && + (vmcb->save.cr0 & X86_CR0_NW)) + return false; - if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { - svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; - nested_svm_init_mmu_context(&svm->vcpu); + return nested_vmcb_check_controls(&vmcb->control); +} + +static void load_nested_vmcb_control(struct vcpu_svm *svm, + struct vmcb_control_area *control) +{ + copy_vmcb_control_area(&svm->nested.ctl, control); + + /* Copy it here because nested_svm_check_controls will check it. 
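/*
 * [Illustrative sketch, not part of the diff] What the masking in
 * load_nested_vmcb_control() accomplishes: the MSRPM/IOPM base addresses
 * supplied by L1 are only meaningful at page granularity, so the offset
 * bits are cleared once at VMRUN rather than at every bitmap lookup.  The
 * constant name is invented for the example; the kernel uses the literal
 * ~0x0fffULL.
 */
#include <stdint.h>

#define GPA_PAGE_OFFSET_MASK 0x0fffULL	/* low 12 bits of a 4 KiB page */

static uint64_t sanitize_bitmap_base(uint64_t l1_gpa)
{
	return l1_gpa & ~GPA_PAGE_OFFSET_MASK;
}
/* sanitize_bitmap_base(0x12345678ULL) == 0x12345000ULL */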
*/ + svm->nested.ctl.asid = control->asid; + svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; + svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; +} + +/* + * Synchronize fields that are written by the processor, so that + * they can be copied back into the nested_vmcb. + */ +void sync_nested_vmcb_control(struct vcpu_svm *svm) +{ + u32 mask; + svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; + svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err; + + /* Only a few fields of int_ctl are written by the processor. */ + mask = V_IRQ_MASK | V_TPR_MASK; + if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && + is_intercept(svm, SVM_EXIT_VINTR)) { + /* + * In order to request an interrupt window, L0 is usurping + * svm->vmcb->control.int_ctl and possibly setting V_IRQ + * even if it was clear in L1's VMCB. Restoring it would be + * wrong. However, in this case V_IRQ will remain true until + * interrupt_window_interception calls svm_clear_vintr and + * restores int_ctl. We can just leave it aside. + */ + mask &= ~V_IRQ_MASK; } + svm->nested.ctl.int_ctl &= ~mask; + svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; +} + +/* + * Transfer any event that L0 or L1 wanted to inject into L2 to + * EXIT_INT_INFO. + */ +static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, + struct vmcb *nested_vmcb) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + u32 exit_int_info = 0; + unsigned int nr; + + if (vcpu->arch.exception.injected) { + nr = vcpu->arch.exception.nr; + exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; + + if (vcpu->arch.exception.has_error_code) { + exit_int_info |= SVM_EVTINJ_VALID_ERR; + nested_vmcb->control.exit_int_info_err = + vcpu->arch.exception.error_code; + } + + } else if (vcpu->arch.nmi_injected) { + exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + } else if (vcpu->arch.interrupt.injected) { + nr = vcpu->arch.interrupt.nr; + exit_int_info = nr | SVM_EVTINJ_VALID; + + if (vcpu->arch.interrupt.soft) + exit_int_info |= SVM_EVTINJ_TYPE_SOFT; + else + exit_int_info |= SVM_EVTINJ_TYPE_INTR; + } + + nested_vmcb->control.exit_int_info = exit_int_info; +} + +static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) +{ /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; svm->vmcb->save.cs = nested_vmcb->save.cs; @@ -248,14 +323,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = nested_vmcb->save.cr3; - svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); @@ -267,40 +335,36 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->save.rsp = nested_vmcb->save.rsp; svm->vmcb->save.rip = nested_vmcb->save.rip; svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; +} - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; - svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; +static void 
nested_prepare_vmcb_control(struct vcpu_svm *svm) +{ + const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) + nested_svm_init_mmu_context(&svm->vcpu); - /* cache intercepts */ - svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; - svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; - svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; - svm->nested.intercept = nested_vmcb->control.intercept; + /* Guest paging mode is active - reset mmu */ + kvm_mmu_reset_context(&svm->vcpu); - svm_flush_tlb(&svm->vcpu, true); - svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; - if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) - svm->vcpu.arch.hflags |= HF_VINTR_MASK; - else - svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + svm_flush_tlb(&svm->vcpu); - svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = + svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; - svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; - svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; - svm->vmcb->control.int_state = nested_vmcb->control.int_state; - svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; - svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + svm->vmcb->control.int_ctl = + (svm->nested.ctl.int_ctl & ~mask) | + (svm->nested.hsave->control.int_ctl & mask); - svm->vmcb->control.pause_filter_count = - nested_vmcb->control.pause_filter_count; - svm->vmcb->control.pause_filter_thresh = - nested_vmcb->control.pause_filter_thresh; + svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; + svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; + svm->vmcb->control.int_state = svm->nested.ctl.int_state; + svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; + svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; - kvm_vcpu_unmap(&svm->vcpu, map, true); + svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; + svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; /* Enter Guest-Mode */ enter_guest_mode(&svm->vcpu); @@ -311,25 +375,18 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, */ recalc_intercepts(svm); - svm->nested.vmcb = vmcb_gpa; + mark_all_dirty(svm->vmcb); +} - /* - * If L1 had a pending IRQ/NMI before executing VMRUN, - * which wasn't delivered because it was disallowed (e.g. - * interrupts disabled), L0 needs to evaluate if this pending - * event should cause an exit from L2 to L1 or be delivered - * directly to L2. - * - * Usually this would be handled by the processor noticing an - * IRQ/NMI window request. However, VMRUN can unblock interrupts - * by implicitly setting GIF, so force L0 to perform pending event - * evaluation by requesting a KVM_REQ_EVENT. 
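/*
 * [Illustrative sketch, not part of the diff] The int_ctl merge above in
 * isolation: virtual-interrupt state is taken from L1's VMCB, while the
 * bits L0 owns while L2 runs (interrupt masking, vGIF enables) come from
 * the host save area.  The shift values are believed to match the kernel's
 * definitions at the time of this merge, but treat them as illustrative.
 */
#include <stdint.h>

#define V_GIF_MASK		(1u << 9)
#define V_INTR_MASKING_MASK	(1u << 24)
#define V_GIF_ENABLE_MASK	(1u << 25)

static uint32_t merge_int_ctl(uint32_t l1_int_ctl, uint32_t host_int_ctl)
{
	const uint32_t mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK |
			      V_GIF_MASK;

	return (l1_int_ctl & ~mask) | (host_int_ctl & mask);
}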
- */ - enable_gif(svm); - if (unlikely(evaluate_pending_interrupts)) - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); +void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb) +{ + svm->nested.vmcb = vmcb_gpa; + load_nested_vmcb_control(svm, &nested_vmcb->control); + nested_prepare_vmcb_save(svm, nested_vmcb); + nested_prepare_vmcb_control(svm); - mark_all_dirty(svm->vmcb); + svm_set_gif(svm, true); } int nested_svm_vmrun(struct vcpu_svm *svm) @@ -341,8 +398,12 @@ int nested_svm_vmrun(struct vcpu_svm *svm) struct kvm_host_map map; u64 vmcb_gpa; - vmcb_gpa = svm->vmcb->save.rax; + if (is_smm(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + vmcb_gpa = svm->vmcb->save.rax; ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); if (ret == -EINVAL) { kvm_inject_gp(&svm->vcpu, 0); @@ -360,10 +421,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb->control.exit_code_hi = 0; nested_vmcb->control.exit_info_1 = 0; nested_vmcb->control.exit_info_2 = 0; - - kvm_vcpu_unmap(&svm->vcpu, &map, true); - - return ret; + goto out; } trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, @@ -403,9 +461,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm) else hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - copy_vmcb_control_area(hsave, vmcb); + copy_vmcb_control_area(&hsave->control, &vmcb->control); - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); + svm->nested.nested_run_pending = 1; + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb); if (!nested_svm_vmrun_msrpm(svm)) { svm->vmcb->control.exit_code = SVM_EXIT_ERR; @@ -416,6 +475,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_svm_vmexit(svm); } +out: + kvm_vcpu_unmap(&svm->vcpu, &map, true); + return ret; } @@ -443,13 +505,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, - vmcb->control.exit_info_1, - vmcb->control.exit_info_2, - vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); if (rc) { if (rc == -EINVAL) @@ -462,9 +517,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); svm->nested.vmcb = 0; + WARN_ON_ONCE(svm->nested.nested_run_pending); + + /* in case we halted in L2 */ + svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; /* Give the current vmcb to the guest */ - disable_gif(svm); + svm_set_gif(svm, false); nested_vmcb->save.es = vmcb->save.es; nested_vmcb->save.cs = vmcb->save.cs; @@ -478,62 +537,42 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.cr2 = vmcb->save.cr2; nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = vmcb->save.rip; - nested_vmcb->save.rsp = vmcb->save.rsp; - nested_vmcb->save.rax = vmcb->save.rax; + nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu); + nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu); + nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu); nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; nested_vmcb->save.cpl = vmcb->save.cpl; - nested_vmcb->control.int_ctl = vmcb->control.int_ctl; - nested_vmcb->control.int_vector = vmcb->control.int_vector; nested_vmcb->control.int_state = vmcb->control.int_state; nested_vmcb->control.exit_code = vmcb->control.exit_code; nested_vmcb->control.exit_code_hi = 
vmcb->control.exit_code_hi; nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; - nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; - nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + + if (nested_vmcb->control.exit_code != SVM_EXIT_ERR) + nested_vmcb_save_pending_event(svm, nested_vmcb); if (svm->nrips_enabled) nested_vmcb->control.next_rip = vmcb->control.next_rip; - /* - * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have - * to make sure that we do not lose injected events. So check event_inj - * here and copy it to exit_int_info if it is valid. - * Exit_int_info and event_inj can't be both valid because the case - * below only happens on a VMRUN instruction intercept which has - * no valid exit_int_info set. - */ - if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { - struct vmcb_control_area *nc = &nested_vmcb->control; - - nc->exit_int_info = vmcb->control.event_inj; - nc->exit_int_info_err = vmcb->control.event_inj_err; - } - - nested_vmcb->control.tlb_ctl = 0; - nested_vmcb->control.event_inj = 0; - nested_vmcb->control.event_inj_err = 0; + nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl; + nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl; + nested_vmcb->control.event_inj = svm->nested.ctl.event_inj; + nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; nested_vmcb->control.pause_filter_count = svm->vmcb->control.pause_filter_count; nested_vmcb->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; - /* We always set V_INTR_MASKING and remember the old value in hflags */ - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - /* Restore the original control entries */ - copy_vmcb_control_area(vmcb, hsave); + copy_vmcb_control_area(&vmcb->control, &hsave->control); - svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = + svm->vcpu.arch.l1_tsc_offset; - svm->nested.nested_cr3 = 0; + svm->nested.ctl.nested_cr3 = 0; /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; @@ -561,6 +600,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) mark_all_dirty(svm->vmcb); + trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, + nested_vmcb->control.exit_info_1, + nested_vmcb->control.exit_info_2, + nested_vmcb->control.exit_int_info, + nested_vmcb->control.exit_int_info_err, + KVM_ISA_SVM); + kvm_vcpu_unmap(&svm->vcpu, &map, true); nested_svm_uninit_mmu_context(&svm->vcpu); @@ -578,12 +624,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. 
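/*
 * [Worked example, not part of the diff] EXITINTINFO encoding as built by
 * nested_vmcb_save_pending_event(): a pending event is handed back to L1
 * in the same format the EVENTINJ field uses.  Constants mirror the
 * SVM_EVTINJ_* definitions.
 */
#include <stdint.h>

#define EVTINJ_VALID		(1u << 31)
#define EVTINJ_VALID_ERR	(1u << 11)
#define EVTINJ_TYPE_NMI		(2u << 8)
#define EVTINJ_TYPE_EXEPT	(3u << 8)
#define EVTINJ_TYPE_SOFT	(4u << 8)

static uint32_t encode_exception(unsigned int vector, int has_error_code)
{
	uint32_t info = vector | EVTINJ_VALID | EVTINJ_TYPE_EXEPT;

	if (has_error_code)	/* the code itself goes in EXITINTINFO_ERR */
		info |= EVTINJ_VALID_ERR;
	return info;
}
/* encode_exception(14, 1) == 0x80000b0e: #PF with a valid error code */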
+ */ +void svm_leave_nested(struct vcpu_svm *svm) +{ + if (is_guest_mode(&svm->vcpu)) { + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + + svm->nested.nested_run_pending = 0; + leave_guest_mode(&svm->vcpu); + copy_vmcb_control_area(&vmcb->control, &hsave->control); + nested_svm_uninit_mmu_context(&svm->vcpu); + } +} + static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) { u32 offset, msr, value; int write, mask; - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -597,37 +659,12 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) /* Offset is in 32 bit units but need in 8 bit units */ offset *= 4; - if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4)) return NESTED_EXIT_DONE; return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } -/* DB exceptions for our internal use must not cause vmexit */ -static int nested_svm_intercept_db(struct vcpu_svm *svm) -{ - unsigned long dr6; - - /* if we're not singlestepping, it's not ours */ - if (!svm->nmi_singlestep) - return NESTED_EXIT_DONE; - - /* if it's not a singlestep exception, it's not ours */ - if (kvm_get_dr(&svm->vcpu, 6, &dr6)) - return NESTED_EXIT_DONE; - if (!(dr6 & DR6_BS)) - return NESTED_EXIT_DONE; - - /* if the guest is singlestepping, it should get the vmexit */ - if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { - disable_nmi_singlestep(svm); - return NESTED_EXIT_DONE; - } - - /* it's ours, the nested hypervisor must not see this one */ - return NESTED_EXIT_HOST; -} - static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { unsigned port, size, iopm_len; @@ -635,13 +672,13 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - gpa = svm->nested.vmcb_iopm + (port / 8); + gpa = svm->nested.ctl.iopm_base_pa + (port / 8); start_bit = port % 8; iopm_len = (start_bit + size > 8) ? 2 : 1; mask = (0xf >> (4 - size)) << start_bit; @@ -667,28 +704,23 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.intercept_cr & bit) + if (svm->nested.ctl.intercept_cr & bit) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.intercept_dr & bit) + if (svm->nested.ctl.intercept_dr & bit) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { - u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) { - if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) - vmexit = nested_svm_intercept_db(svm); - else - vmexit = NESTED_EXIT_DONE; - } - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->vcpu.arch.exception.nested_apf != 0) - vmexit = NESTED_EXIT_DONE; + /* + * Host-intercepted exceptions have been checked already in + * nested_svm_exit_special. 
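/*
 * [Worked example, not part of the diff] The IOPM probe arithmetic from
 * nested_svm_intercept_ioio() above: one permission bit per I/O port, so
 * an access of size 1..4 bytes checks that many consecutive bits, which
 * may straddle a byte boundary (hence reading one or two bytes of the map).
 */
#include <stdint.h>

struct iopm_probe {
	uint64_t byte_offset;	/* added to iopm_base_pa */
	uint16_t mask;		/* bits to test in the bytes read */
	unsigned int nbytes;	/* 1 or 2 */
};

static struct iopm_probe iopm_probe(unsigned int port, unsigned int size)
{
	unsigned int start_bit = port % 8;
	struct iopm_probe p;

	p.byte_offset = port / 8;
	p.nbytes = (start_bit + size > 8) ? 2 : 1;
	p.mask = (0xf >> (4 - size)) << start_bit;
	return p;
}
/* port 0x3f8, size 2 -> byte 0x7f, mask 0x3, single-byte read */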
There is nothing to do here, + * the vmexit is injected by svm_check_nested_events. + */ + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_ERR: { @@ -697,7 +729,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.intercept & exit_bits) + if (svm->nested.ctl.intercept & exit_bits) vmexit = NESTED_EXIT_DONE; } } @@ -733,62 +765,140 @@ int nested_svm_check_permissions(struct vcpu_svm *svm) return 0; } -int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, - bool has_error_code, u32 error_code) +static bool nested_exit_on_exception(struct vcpu_svm *svm) { - int vmexit; + unsigned int nr = svm->vcpu.arch.exception.nr; - if (!is_guest_mode(&svm->vcpu)) - return 0; + return (svm->nested.ctl.intercept_exceptions & (1 << nr)); +} - vmexit = nested_svm_intercept(svm); - if (vmexit != NESTED_EXIT_DONE) - return 0; +static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) +{ + unsigned int nr = svm->vcpu.arch.exception.nr; svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; + + if (svm->vcpu.arch.exception.has_error_code) + svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; /* * EXITINFO2 is undefined for all exception intercepts other * than #PF. */ - if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; - else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; - else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + if (nr == PF_VECTOR) { + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else if (svm->vcpu.arch.exception.has_payload) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + } else if (nr == DB_VECTOR) { + /* See inject_pending_event. 
*/ + kvm_deliver_exception_payload(&svm->vcpu); + if (svm->vcpu.arch.dr7 & DR7_GD) { + svm->vcpu.arch.dr7 &= ~DR7_GD; + kvm_update_dr7(&svm->vcpu); + } + } else + WARN_ON(svm->vcpu.arch.exception.has_payload); - svm->nested.exit_required = true; - return vmexit; + nested_svm_vmexit(svm); +} + +static void nested_svm_smi(struct vcpu_svm *svm) +{ + svm->vmcb->control.exit_code = SVM_EXIT_SMI; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); +} + +static void nested_svm_nmi(struct vcpu_svm *svm) +{ + svm->vmcb->control.exit_code = SVM_EXIT_NMI; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); } static void nested_svm_intr(struct vcpu_svm *svm) { + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; - /* nested_svm_vmexit this gets called afterwards from handle_exit */ - svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + nested_svm_vmexit(svm); } -static bool nested_exit_on_intr(struct vcpu_svm *svm) +static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return (svm->nested.intercept & 1ULL); + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT)); } -int svm_check_nested_events(struct kvm_vcpu *vcpu) +static void nested_svm_init(struct vcpu_svm *svm) +{ + svm->vmcb->control.exit_code = SVM_EXIT_INIT; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); +} + + +static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool block_nested_events = - kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required; + kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending; + struct kvm_lapic *apic = vcpu->arch.apic; - if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) { + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; + if (!nested_exit_on_init(svm)) + return 0; + nested_svm_init(svm); + return 0; + } + + if (vcpu->arch.exception.pending) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_exception(svm)) + return 0; + nested_svm_inject_exception_vmexit(svm); + return 0; + } + + if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_smi(svm)) + return 0; + nested_svm_smi(svm); + return 0; + } + + if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_nmi(svm)) + return 0; + nested_svm_nmi(svm); + return 0; + } + + if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_intr(svm)) + return 0; nested_svm_intr(svm); return 0; } @@ -803,21 +913,170 @@ int nested_svm_exit_special(struct vcpu_svm *svm) switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: - case SVM_EXIT_EXCP_BASE + MC_VECTOR: - return NESTED_EXIT_HOST; case SVM_EXIT_NPF: - /* For now we are always handling NPFs when using them */ - if (npt_enabled) + return NESTED_EXIT_HOST; + case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + + if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) return NESTED_EXIT_HOST; - break; - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) + else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && + svm->vcpu.arch.apf.host_apf_flags) + /* Trap async PF even if not shadowing */ return NESTED_EXIT_HOST; break; + } default: break; } return NESTED_EXIT_CONTINUE; } + +static int svm_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_svm *svm; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = KVM_STATE_NESTED_FORMAT_SVM, + .size = sizeof(kvm_state), + }; + struct vmcb __user *user_vmcb = (struct vmcb __user *) + &user_kvm_nested_state->data.svm[0]; + + if (!vcpu) + return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE; + + svm = to_svm(vcpu); + + if (user_data_size < kvm_state.size) + goto out; + + /* First fill in the header and copy it out. */ + if (is_guest_mode(vcpu)) { + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (svm->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + + if (gif_set(svm)) + kvm_state.flags |= KVM_STATE_NESTED_GIF_SET; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (!is_guest_mode(vcpu)) + goto out; + + /* + * Copy over the full size of the VMCB rather than just the size + * of the structs. + */ + if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) + return -EFAULT; + if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, + sizeof(user_vmcb->control))) + return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save, + sizeof(user_vmcb->save))) + return -EFAULT; + +out: + return kvm_state.size; +} + +static int svm_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *hsave = svm->nested.hsave; + struct vmcb __user *user_vmcb = (struct vmcb __user *) + &user_kvm_nested_state->data.svm[0]; + struct vmcb_control_area ctl; + struct vmcb_save_area save; + u32 cr0; + + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM) + return -EINVAL; + + if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GIF_SET)) + return -EINVAL; + + /* + * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's + * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed. + */ + if (!(vcpu->arch.efer & EFER_SVME)) { + /* GIF=1 and no guest mode are required if SVME=0. */ + if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET) + return -EINVAL; + } + + /* SMM temporarily disables SVM, so we cannot be in guest mode. 
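/*
 * [Illustrative sketch, not part of the diff] Size contract implemented by
 * svm_get_nested_state() above: the fixed header is always produced, and a
 * full VMCB page is appended only while the vCPU is in guest mode.
 * Userspace compares the return value against its buffer size and retries
 * with a larger buffer if needed.  The header size here is illustrative.
 */
#include <stdint.h>

#define NESTED_STATE_HDR_SIZE	128u	/* sizeof(struct kvm_nested_state) */
#define SVM_VMCB_SIZE		4096u	/* KVM_STATE_NESTED_SVM_VMCB_SIZE */

static uint32_t nested_state_size(int guest_mode)
{
	return NESTED_STATE_HDR_SIZE + (guest_mode ? SVM_VMCB_SIZE : 0);
}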
*/ + if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { + svm_leave_nested(svm); + goto out_set_gif; + } + + if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa)) + return -EINVAL; + if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) + return -EINVAL; + if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl))) + return -EFAULT; + if (copy_from_user(&save, &user_vmcb->save, sizeof(save))) + return -EFAULT; + + if (!nested_vmcb_check_controls(&ctl)) + return -EINVAL; + + /* + * Processor state contains L2 state. Check that it is + * valid for guest mode (see nested_vmcb_checks). + */ + cr0 = kvm_read_cr0(vcpu); + if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) + return -EINVAL; + + /* + * Validate host state saved from before VMRUN (see + * nested_svm_check_permissions). + * TODO: validate reserved bits for all saved state. + */ + if (!(save.cr0 & X86_CR0_PG)) + return -EINVAL; + + /* + * All checks done, we can enter guest mode. L1 control fields + * come from the nested save state. Guest state is already + * in the registers, the save area of the nested state instead + * contains saved L1 state. + */ + copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); + hsave->save = save; + + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; + load_nested_vmcb_control(svm, &ctl); + nested_prepare_vmcb_control(svm); + +out_set_gif: + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; +} + +struct kvm_x86_nested_ops svm_nested_ops = { + .check_events = svm_check_nested_events, + .get_state = svm_get_nested_state, + .set_state = svm_set_nested_state, +}; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index ce0b10fe5e2b..035da07500e8 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -215,21 +215,22 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } -static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; + u32 msr = msr_info->index; /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { - *data = pmc_read_counter(pmc); + msr_info->data = pmc_read_counter(pmc); return 0; } /* MSR_EVNTSELn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { - *data = pmc->eventsel; + msr_info->data = pmc->eventsel; return 0; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0e3fc311d7da..5573a97f1520 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/highmem.h> #include <linux/psp-sev.h> +#include <linux/pagemap.h> #include <linux/swap.h> #include "x86.h" @@ -335,8 +336,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); @@ -344,7 +344,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); + npinned = get_user_pages_fast(uaddr, npages, write ? 
FOLL_WRITE : 0, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; @@ -1117,7 +1117,7 @@ int __init sev_hardware_setup(void) /* Maximum number of encrypted guests supported simultaneously */ max_sev_asid = cpuid_ecx(0x8000001F); - if (!max_sev_asid) + if (!svm_sev_enabled()) return 1; /* Minimum ASID value that should be used for SEV guest */ @@ -1156,6 +1156,9 @@ err: void sev_hardware_teardown(void) { + if (!svm_sev_enabled()) + return; + bitmap_free(sev_asid_bitmap); bitmap_free(sev_reclaim_asid_bitmap); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2be5bbae3a40..7502cd65528f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -33,6 +33,7 @@ #include <asm/debugreg.h> #include <asm/kvm_para.h> #include <asm/irq_remapping.h> +#include <asm/mce.h> #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> @@ -264,6 +265,7 @@ static int get_npt_level(struct kvm_vcpu *vcpu) void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { + struct vcpu_svm *svm = to_svm(vcpu); vcpu->arch.efer = efer; if (!npt_enabled) { @@ -274,8 +276,13 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; } - to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + if (!(efer & EFER_SVME)) { + svm_leave_nested(svm); + svm_set_gif(svm, true); + } + + svm->vmcb->save.efer = efer | EFER_SVME; + mark_dirty(svm->vmcb, VMCB_CR); } static int is_external_interrupt(u32 info) @@ -318,9 +325,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; } else { - if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) - pr_err("%s: ip 0x%lx next 0x%llx\n", - __func__, kvm_rip_read(vcpu), svm->next_rip); kvm_rip_write(vcpu, svm->next_rip); } svm_set_interrupt_shadow(vcpu, 0); @@ -333,17 +337,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned nr = vcpu->arch.exception.nr; bool has_error_code = vcpu->arch.exception.has_error_code; - bool reinject = vcpu->arch.exception.injected; u32 error_code = vcpu->arch.exception.error_code; - /* - * If we are within a nested VM we'd better #VMEXIT and let the guest - * handle the exception - */ - if (!reinject && - nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - kvm_deliver_exception_payload(&svm->vcpu); if (nr == BP_VECTOR && !nrips) { @@ -780,7 +775,7 @@ static __init void svm_adjust_mmio_mask(void) */ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK); } static void svm_hardware_teardown(void) @@ -890,7 +885,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL); + kvm_configure_mmu(npt_enabled, PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); if (nrips) { @@ -953,16 +948,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (is_guest_mode(vcpu)) - return svm->nested.hsave->control.tsc_offset; - - return vcpu->arch.tsc_offset; -} - static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1208,6 +1193,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->avic_is_running = true; svm->nested.hsave = page_address(hsave_page); + clear_page(svm->nested.hsave); svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); @@ -1364,12 +1350,13 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static inline void svm_enable_vintr(struct vcpu_svm *svm) +static void svm_set_vintr(struct vcpu_svm *svm) { struct vmcb_control_area *control; /* The following fields are ignored when AVIC is enabled */ WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); + set_intercept(svm, INTERCEPT_VINTR); /* * This is just a dummy VINTR to actually cause a vmexit to happen. @@ -1383,18 +1370,19 @@ static inline void svm_enable_vintr(struct vcpu_svm *svm) mark_dirty(svm->vmcb, VMCB_INTR); } -static void svm_set_vintr(struct vcpu_svm *svm) -{ - set_intercept(svm, INTERCEPT_VINTR); - if (is_intercept(svm, INTERCEPT_VINTR)) - svm_enable_vintr(svm); -} - static void svm_clear_vintr(struct vcpu_svm *svm) { + const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; clr_intercept(svm, INTERCEPT_VINTR); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + /* Drop int_ctl fields related to VINTR injection. */ + svm->vmcb->control.int_ctl &= mask; + if (is_guest_mode(&svm->vcpu)) { + WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != + (svm->nested.ctl.int_ctl & V_TPR_MASK)); + svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; + } + mark_dirty(svm->vmcb, VMCB_INTR); } @@ -1533,14 +1521,6 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) mark_dirty(svm->vmcb, VMCB_DT); } -static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ -} - -static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ -} - static void update_cr0_intercept(struct vcpu_svm *svm) { ulong gcr0 = svm->vcpu.arch.cr0; @@ -1603,7 +1583,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu, true); + svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@ -1672,17 +1652,14 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) mark_dirty(svm->vmcb, VMCB_ASID); } -static u64 svm_get_dr6(struct kvm_vcpu *vcpu) +static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) { - return to_svm(vcpu)->vmcb->save.dr6; -} - -static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) -{ - struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - svm->vmcb->save.dr6 = value; - mark_dirty(svm->vmcb, VMCB_DR); + if (unlikely(value != vmcb->save.dr6)) { + vmcb->save.dr6 = value; + mark_dirty(vmcb, VMCB_DR); + } } static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) @@ -1693,9 +1670,12 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); - vcpu->arch.dr6 = svm_get_dr6(vcpu); + /* + * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here, + * 
because db_interception might need it. We can do it before vmentry. + */ + vcpu->arch.dr6 = svm->vmcb->save.dr6; vcpu->arch.dr7 = svm->vmcb->save.dr7; - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; set_dr_intercepts(svm); } @@ -1739,7 +1719,8 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { - kvm_queue_exception(&svm->vcpu, DB_VECTOR); + u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1; + kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload); return 1; } @@ -1752,6 +1733,8 @@ if (svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; + kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = DB_VECTOR; @@ -1839,6 +1822,25 @@ static bool is_erratum_383(void) return true; } +/* + * Trigger machine check on the host. We assume all the MSRs are already set up + * by the CPU and that we still run on the same CPU as the MCE occurred on. + * We pass a fake environment to the machine check handler because we want + * the guest to be always treated like user space, no matter what context + * it used internally. + */ +static void kvm_machine_check(void) +{ +#if defined(CONFIG_X86_MCE) + struct pt_regs regs = { + .cs = 3, /* Fake ring 3 no matter what the guest ran on */ + .flags = X86_EFLAGS_IF, + }; + + do_machine_check(&regs); +#endif +} + static void svm_handle_mce(struct vcpu_svm *svm) { if (is_erratum_383()) { @@ -1857,11 +1859,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. */ - asm volatile ( - "int $0x12\n"); - /* not sure if we ever come back to this point */ - - return; + kvm_machine_check(); } static int mc_interception(struct vcpu_svm *svm) @@ -1990,6 +1988,38 @@ static int vmrun_interception(struct vcpu_svm *svm) return nested_svm_vmrun(svm); } +void svm_set_gif(struct vcpu_svm *svm, bool value) +{ + if (value) { + /* + * If VGIF is enabled, the STGI intercept is only added to + * detect the opening of the SMI/NMI window; remove it now. + * Likewise, clear the VINTR intercept, we will set it + * again while processing KVM_REQ_EVENT if needed. + */ + if (vgif_enabled(svm)) + clr_intercept(svm, INTERCEPT_STGI); + if (is_intercept(svm, SVM_EXIT_VINTR)) + svm_clear_vintr(svm); + + enable_gif(svm); + if (svm->vcpu.arch.smi_pending || + svm->vcpu.arch.nmi_pending || + kvm_cpu_has_injectable_intr(&svm->vcpu)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } else { + disable_gif(svm); + + /* + * After a CLGI no interrupts should come. But if vGIF is + * in use, we still rely on the VINTR intercept (rather than + * STGI) to detect an open interrupt window. + */ + if (!vgif_enabled(svm)) + svm_clear_vintr(svm); + } +} + static int stgi_interception(struct vcpu_svm *svm) { int ret; @@ -1997,18 +2027,8 @@ if (nested_svm_check_permissions(svm)) return 1; - /* - * If VGIF is enabled, the STGI intercept is only added to - * detect the opening of the SMI/NMI window; remove it now.
- */ - if (vgif_enabled(svm)) - clr_intercept(svm, INTERCEPT_STGI); - ret = kvm_skip_emulated_instruction(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - - enable_gif(svm); - + svm_set_gif(svm, true); return ret; } @@ -2020,13 +2040,7 @@ static int clgi_interception(struct vcpu_svm *svm) return 1; ret = kvm_skip_emulated_instruction(&svm->vcpu); - - disable_gif(svm); - - /* After a CLGI no interrupts should come */ - if (!kvm_vcpu_apicv_active(&svm->vcpu)) - svm_clear_vintr(svm); - + svm_set_gif(svm, false); return ret; } @@ -2190,7 +2204,7 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, bool ret = false; u64 intercept; - intercept = svm->nested.intercept; + intercept = svm->nested.ctl.intercept; if (!is_guest_mode(&svm->vcpu) || (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) @@ -2668,8 +2682,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) */ svm_toggle_avic_for_irq_window(&svm->vcpu, true); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; return 1; } @@ -2895,8 +2907,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion exit_fastpath) +static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -2909,12 +2920,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (unlikely(svm->nested.exit_required)) { - nested_svm_vmexit(svm); - svm->nested.exit_required = false; - - return 1; - } + svm_complete_interrupts(svm); if (is_guest_mode(vcpu)) { int vmexit; @@ -2935,8 +2941,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, return 1; } - svm_complete_interrupts(svm); - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2954,10 +2958,10 @@ static int handle_exit(struct kvm_vcpu *vcpu, __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { - kvm_skip_emulated_instruction(vcpu); + if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -3046,18 +3050,37 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } -static int svm_nmi_allowed(struct kvm_vcpu *vcpu) +bool svm_nmi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int ret; - ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - !(svm->vcpu.arch.hflags & HF_NMI_MASK); - ret = ret && gif_set(svm) && nested_svm_nmi(svm); + bool ret; + + if (!gif_set(svm)) + return true; + + if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) + return false; + + ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || + (svm->vcpu.arch.hflags & HF_NMI_MASK); return ret; } +static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) + return -EBUSY; + + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. 
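+ * nested_exit_on_nmi() checks INTERCEPT_NMI in the cached vmcb12 controls
+ * (see the helper added to svm.h below), so in that case the NMI has to be
+ * delivered as a nested VM-Exit rather than injected.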
*/ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) + return -EBUSY; + + return !svm_nmi_blocked(vcpu); +} + static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3078,19 +3101,46 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } -static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) +bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!gif_set(svm) || - (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) - return 0; + if (!gif_set(svm)) + return true; - if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return !!(svm->vcpu.arch.hflags & HF_HIF_MASK); - else - return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); + if (is_guest_mode(vcpu)) { + /* As long as interrupts are being delivered... */ + if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) + ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) + : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + return true; + + /* ... vmexits aren't blocked by the interrupt shadow */ + if (nested_exit_on_intr(svm)) + return false; + } else { + if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + return true; + } + + return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); +} + +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) + return -EBUSY; + + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. + */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) + return -EBUSY; + + return !svm_interrupt_blocked(vcpu); } static void enable_irq_window(struct kvm_vcpu *vcpu) @@ -3131,9 +3181,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; /* STGI will cause a vm exit */ } - if (svm->nested.exit_required) - return; /* we're not going to run the guest yet */ - /* * Something prevents NMI from being injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) @@ -3153,10 +3200,17 @@ static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) return 0; } -void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +void svm_flush_tlb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * Flush only the current ASID even if the TLB flush was invoked via + * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all + * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and + * unconditionally does a TLB flush on both nested VM-Enter and nested + * VM-Exit (via kvm_mmu_reset_context()).
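+ *
+ * Since L1 and L2 share that single ASID, flushing only the current ASID
+ * also covers "flush everything" requests; the unconditional flushes on
+ * nested transitions take care of the rest.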
+ */ if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; else @@ -3276,10 +3330,21 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -bool __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); +static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + + return EXIT_FASTPATH_NONE; +} + +void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); -static void svm_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { + fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3287,13 +3352,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; /* - * A vmexit emulation is required before the vcpu can be executed - * again. - */ - if (unlikely(svm->nested.exit_required)) - return; - - /* * Disable singlestep if we're injecting an interrupt/exception. * We don't want our modified rflags to be pushed on the stack where * we might not be able to easily reset them if we disabled NMI @@ -3315,6 +3373,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; + /* + * Run with all-zero DR6 unless needed, so that we can get the exact cause + * of a #DB. + */ + if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + svm_set_dr6(svm, vcpu->arch.dr6); + else + svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM); + clgi(); kvm_load_guest_xsave_state(vcpu); @@ -3330,13 +3397,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - local_irq_enable(); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else @@ -3366,8 +3428,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) reload_tss(vcpu); - local_irq_disable(); - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); vcpu->arch.cr2 = svm->vmcb->save.cr2; @@ -3382,6 +3442,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ + exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -3389,12 +3450,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_cr8_to_lapic(vcpu); svm->next_rip = 0; + if (is_guest_mode(&svm->vcpu)) { + sync_nested_vmcb_control(svm); + svm->nested.nested_run_pending = 0; + } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + svm->vcpu.arch.apf.host_apf_flags = + kvm_read_and_reset_apf_flags(); if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); @@ -3410,13 +3476,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); mark_all_clean(svm->vmcb); + return exit_fastpath; } -STACK_FRAME_NON_STANDARD(svm_vcpu_run); static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - bool update_guest_cr3 = true; unsigned long cr3; cr3 
= __sme_set(root); @@ -3425,18 +3490,13 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ - if (is_guest_mode(vcpu)) - update_guest_cr3 = false; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - cr3 = vcpu->arch.cr3; - else /* CR3 is already up-to-date. */ - update_guest_cr3 = false; + if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + return; + cr3 = vcpu->arch.cr3; } - if (update_guest_cr3) { - svm->vmcb->save.cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_CR); - } + svm->vmcb->save.cr3 = cr3; + mark_dirty(svm->vmcb, VMCB_CR); } static int is_disabled(void) @@ -3471,7 +3531,7 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_emulated_msr(int index) +static bool svm_has_emulated_msr(u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: @@ -3624,7 +3684,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - intercept = svm->nested.intercept; + intercept = svm->nested.ctl.intercept; if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) break; @@ -3712,13 +3772,8 @@ out: return ret; } -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) @@ -3733,23 +3788,28 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } -static int svm_smi_allowed(struct kvm_vcpu *vcpu) +bool svm_smi_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); /* Per APM Vol.2 15.22.2 "Response to SMI" */ if (!gif_set(svm)) - return 0; + return true; - if (is_guest_mode(&svm->vcpu) && - svm->nested.intercept & (1ULL << INTERCEPT_SMI)) { - /* TODO: Might need to set exit_info_1 and exit_info_2 here */ - svm->vmcb->control.exit_code = SVM_EXIT_SMI; - svm->nested.exit_required = true; - return 0; - } + return is_smm(vcpu); +} - return 1; +static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) + return -EBUSY; + + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) + return -EBUSY; + + return !svm_smi_blocked(vcpu); } static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -3789,12 +3849,13 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) return 1; nested_vmcb = map.hva; - enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map); + enter_svm_guest_mode(svm, vmcb, nested_vmcb); + kvm_vcpu_unmap(&svm->vcpu, &map, true); } return 0; } -static int enable_smi_window(struct kvm_vcpu *vcpu) +static void enable_smi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3802,9 +3863,9 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) if (vgif_enabled(svm)) set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ - return 1; + } else { + /* We must be in SMM; RSM will cause a vmexit anyway. 
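+ * (This follows from svm_smi_blocked() above: GIF is known to be set at
+ * this point, so the only remaining reason for a blocked SMI is is_smm().)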
*/ } - return 0; } static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) @@ -3815,6 +3876,13 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) bool is_user = svm_get_cpl(vcpu) == 3; /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. + */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * * Errata: @@ -3872,9 +3940,9 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) /* * TODO: Last condition latch INIT signals on vCPU when * vCPU is in guest-mode and vmcb12 defines intercept on INIT. - * To properly emulate the INIT intercept, SVM should implement - * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit() - * there if an INIT signal is pending. + * To properly emulate the INIT intercept, + * svm_check_nested_events() should call nested_svm_vmexit() + * if an INIT signal is pending. */ return !gif_set(svm) || (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); @@ -3928,8 +3996,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, - .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -3937,16 +4003,16 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .get_dr6 = svm_get_dr6, - .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .tlb_flush = svm_flush_tlb, + .tlb_flush_all = svm_flush_tlb, + .tlb_flush_current = svm_flush_tlb, .tlb_flush_gva = svm_flush_tlb_gva, + .tlb_flush_guest = svm_flush_tlb, .run = svm_vcpu_run, .handle_exit = handle_exit, @@ -3987,7 +4053,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .read_l1_tsc_offset = svm_read_l1_tsc_offset, .write_l1_tsc_offset = svm_write_l1_tsc_offset, .load_mmu_pgd = svm_load_mmu_pgd, @@ -4000,6 +4065,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, + .nested_ops = &svm_nested_ops, + .deliver_posted_interrupt = svm_deliver_avic_intr, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, @@ -4014,14 +4081,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, .apic_init_signal_blocked = svm_apic_init_signal_blocked, - - .check_nested_events = svm_check_nested_events, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index df3474f4fb02..6ac4c00a5d82 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -86,25 +86,17 @@ struct nested_state { u64 hsave_msr; u64 vm_cr_msr; u64 vmcb; + u32 host_intercept_exceptions; /* These are the merged vectors */ u32 *msrpm; - /* gpa pointers to the real vectors */ - u64 vmcb_msrpm; - u64 vmcb_iopm; + /* A VMRUN has started but has not yet been performed, so + * we cannot inject a 
nested vmexit yet. */ + bool nested_run_pending; - /* A VMEXIT is required but not yet emulated */ - bool exit_required; - - /* cache for intercepts of the guest */ - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - - /* Nested Paging related state */ - u64 nested_cr3; + /* cache for control fields of the guest */ + struct vmcb_control_area ctl; }; struct vcpu_svm { @@ -360,8 +352,12 @@ u32 svm_msrpm_offset(u32 msr); void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); +void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); +bool svm_smi_blocked(struct kvm_vcpu *vcpu); +bool svm_nmi_blocked(struct kvm_vcpu *vcpu); +bool svm_interrupt_blocked(struct kvm_vcpu *vcpu); +void svm_set_gif(struct vcpu_svm *svm, bool value); /* nested.c */ @@ -369,28 +365,31 @@ void disable_nmi_singlestep(struct vcpu_svm *svm); #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ -/* This function returns true if it is save to enable the nmi window */ -static inline bool nested_svm_nmi(struct vcpu_svm *svm) +static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(&svm->vcpu)) - return true; + struct vcpu_svm *svm = to_svm(vcpu); - if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) - return true; + return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK); +} - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->nested.exit_required = true; +static inline bool nested_exit_on_smi(struct vcpu_svm *svm) +{ + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI)); +} - return false; +static inline bool nested_exit_on_intr(struct vcpu_svm *svm) +{ + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR)); } -static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); + return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); } void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct kvm_host_map *map); + struct vmcb *nested_vmcb); +void svm_leave_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); @@ -398,8 +397,10 @@ int nested_svm_exit_handled(struct vcpu_svm *svm); int nested_svm_check_permissions(struct vcpu_svm *svm); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -int svm_check_nested_events(struct kvm_vcpu *vcpu); int nested_svm_exit_special(struct vcpu_svm *svm); +void sync_nested_vmcb_control(struct vcpu_svm *svm); + +extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index fa1af90067e9..bf944334003a 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> +#include <asm/nospec-branch.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -35,7 +36,6 @@ */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP - mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 push %r15 push %r14 @@ 
-78,6 +78,7 @@ SYM_FUNC_START(__svm_vcpu_run) pop %_ASM_AX /* Enter guest mode */ + sti 1: vmload %_ASM_AX jmp 3f 2: cmpb $0, kvm_rebooting @@ -99,6 +100,13 @@ SYM_FUNC_START(__svm_vcpu_run) ud2 _ASM_EXTABLE(5b, 6b) 7: + cli + +#ifdef CONFIG_RETPOLINE + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE +#endif + /* "POP" @regs to RAX. */ pop %_ASM_AX diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 249062f24b94..b66432b015d2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -225,6 +225,14 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 +#define kvm_print_exit_reason(exit_reason, isa) \ + (isa == KVM_ISA_VMX) ? \ + __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ + __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ + (isa == KVM_ISA_VMX) ? \ + __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + /* * Tracepoint for kvm guest exit: */ @@ -250,12 +258,10 @@ TRACE_EVENT(kvm_exit, &__entry->info2); ), - TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx", + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx", __entry->vcpu_id, - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), - __entry->guest_rip, __entry->info1, __entry->info2) + kvm_print_exit_reason(__entry->exit_reason, __entry->isa), + __entry->guest_rip, __entry->info1, __entry->info2) ); /* @@ -588,12 +594,10 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_int_info_err = exit_int_info_err; __entry->isa = isa; ), - TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " + TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), + kvm_print_exit_reason(__entry->exit_code, __entry->isa), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); @@ -626,13 +630,11 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __entry->isa = isa; ), - TP_printk("reason: %s ext_inf1: 0x%016llx " + TP_printk("reason: %s%s%s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) + kvm_print_exit_reason(__entry->exit_code, __entry->isa), + __entry->exit_info1, __entry->exit_info2, + __entry->exit_int_info, __entry->exit_int_info_err) ); /* @@ -1539,6 +1541,57 @@ TRACE_EVENT(kvm_nested_vmenter_failed, __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); +/* + * Tracepoint for syndbg_set_msr. 
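+ * Records Hyper-V synthetic-debugger MSR writes; the matching
+ * syndbg_get_msr tracepoint below logs the read side with the same fields.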
+ */ +TRACE_EVENT(kvm_hv_syndbg_set_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); + +/* + * Tracepoint for syndbg_get_msr. + */ +TRACE_EVENT(kvm_hv_syndbg_get_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8903475f751e..4bbd8b448d22 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -18,6 +18,8 @@ extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define PMU_CAP_FW_WRITES (1ULL << 13) + struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We @@ -367,4 +369,13 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } +static inline u64 vmx_get_perf_capabilities(void) +{ + /* + * Since counters are virtualized, KVM would support full + * width counting unconditionally, even if the host lacks it. + */ + return PMU_CAP_FW_WRITES; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 303813423c3e..e5325bd0f304 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -4,6 +4,7 @@ #include <linux/smp.h> #include "../hyperv.h" +#include "../cpuid.h" #include "evmcs.h" #include "vmcs.h" #include "vmx.h" @@ -160,14 +161,6 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), - EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), /* 32 bit rw */ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold, @@ -334,17 +327,18 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * vmcs_version represents the range of supported Enlightened VMCS - * versions: lower 8 bits is the minimal version, higher 8 bits is the - * maximum supported version. KVM supports versions from 1 to - * KVM_EVMCS_VERSION. 
- */ - if (vmx->nested.enlightened_vmcs_enabled) - return (KVM_EVMCS_VERSION << 8) | 1; - - return 0; + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. + */ + if (kvm_cpu_cap_get(X86_FEATURE_VMX) && + vmx->nested.enlightened_vmcs_enabled) + return (KVM_EVMCS_VERSION << 8) | 1; + + return 0; } void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index cbc9ea2de28f..2e7238a57fc1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -303,11 +303,11 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, prev); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vmx_segment_cache_clear(vmx); + vmx_register_cache_reset(vcpu); } /* @@ -328,19 +328,19 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason; + u32 vm_exit_reason; unsigned long exit_qualification = vcpu->arch.exit_qualification; if (vmx->nested.pml_full) { - exit_reason = EXIT_REASON_PML_FULL; + vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; } else if (fault->error_code & PFERR_RSVD_MASK) - exit_reason = EXIT_REASON_EPT_MISCONFIG; + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; else - exit_reason = EXIT_REASON_EPT_VIOLATION; + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; - nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); + nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } @@ -437,11 +437,6 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, } } -static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -698,11 +693,6 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) VM_EXIT_ACK_INTR_ON_EXIT; } -static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) -{ - return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); -} - static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -927,6 +917,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) } return 0; fail: + /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ return i + 1; } @@ -1074,34 +1065,81 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) } /* + * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. + * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't + * enable VPID for L2 (implying it expects a TLB flush on VMX transitions). + * Here's why. + * + * If EPT is enabled by L0 a sync is never needed: + * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there + * cannot be unsync'd SPTEs for either L1 or L2. + * + * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter, + * as VM-Enter isn't required to invalidate guest-physical mappings + * (irrespective of VPID), i.e.
L1 can't rely on the (virtual) CPU to flush + * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't + * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit. + * + * If EPT is disabled by L0: + * - if VPID is enabled by L1 (for L2), the situation is similar to when L1 + * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't + * required to invalidate linear mappings (EPT is disabled so there are + * no combined or guest-physical mappings), i.e. L1 can't rely on the + * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1). + * + * - however if VPID is disabled by L1, then a sync is needed as L1 expects all + * linear mappings (EPT is disabled so there are no combined or guest-physical + * mappings) to be invalidated on both VM-Enter and VM-Exit. + * + * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which + * additionally checks that L2 has been assigned a VPID (when EPT is disabled). + * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect + * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2 + * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has + * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1 + * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't + * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush + * stale TLB entries, at which point L0 will sync L2's MMU. + */ +static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) +{ + return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu)); +} + +/* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to * @entry_failure_code. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, - u32 *entry_failure_code) + enum vm_entry_failure_code *entry_failure_code) { - if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { - if (CC(!nested_cr3_valid(vcpu, cr3))) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return -EINVAL; - } + if (CC(!nested_cr3_valid(vcpu, cr3))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return -EINVAL; + } - /* - * If PAE paging and EPT are both on, CR3 is not used by the CPU and - * must not be dereferenced. - */ - if (is_pae_paging(vcpu) && !nested_ept) { - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { - *entry_failure_code = ENTRY_FAIL_PDPTE; - return -EINVAL; - } + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. + */ + if (!nested_ept && is_pae_paging(vcpu) && + (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return -EINVAL; } } + /* + * Unconditionally skip the TLB flush on fast CR3 switch, all TLB + * flushes are handled by nested_vmx_transition_tlb_flush(). See + * nested_vmx_transition_mmu_sync for details on skipping the MMU sync. 
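+ *
+ * Concretely, the call below always asks to skip the TLB flush, and skips
+ * the MMU sync except in the one case the helper flags: L0 using shadow
+ * paging (!enable_ept) while L1 left VPID disabled for L2.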
+ */ if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3, false); + kvm_mmu_new_pgd(vcpu, cr3, true, + !nested_vmx_transition_mmu_sync(vcpu)); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@ -1132,11 +1170,48 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } -static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, + bool is_vmenter) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; + /* + * If VPID is disabled, linear and combined mappings are flushed on + * VM-Enter/VM-Exit, and guest-physical mappings are valid only for + * their associated EPTP. + */ + if (!enable_vpid) + return; + + /* + * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings + * for *all* contexts to be flushed on VM-Enter/VM-Exit. + * + * If VPID is enabled and used by vmcs12, but L2 does not have a unique + * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate + * a VPID for L2, flush the current context as the effective ASID is + * common to both L1 and L2. + * + * Defer the flush so that it runs after vmcs02.EPTP has been set by + * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid + * redundant flushes further down the nested pipeline. + * + * If a TLB flush isn't required due to any of the above, and vpid12 is + * changing then the new "virtual" VPID (vpid12) will reuse the same + * "real" VPID (vpid02), and so needs to be sync'd. There is no direct + * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for + * all nested vCPUs. + */ + if (!nested_cpu_has_vpid(vmcs12)) { + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } else if (!nested_has_guest_tlb_tag(vcpu)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } else if (is_vmenter && + vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + vpid_sync_context(nested_get_vpid02(vcpu)); + } } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) @@ -1700,10 +1775,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; - * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; - * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; - * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; - * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; * vmcs12->page_fault_error_code_mask = * evmcs->page_fault_error_code_mask; * vmcs12->page_fault_error_code_match = @@ -1777,10 +1848,6 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; - * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; - * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; - * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; - * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; * evmcs->tpr_threshold = vmcs12->tpr_threshold; * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; * evmcs->exception_bitmap = vmcs12->exception_bitmap; @@ -2020,9 +2087,25 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct
hrtimer *timer) return HRTIMER_NORESTART; } -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + + if (!vmx->nested.has_preemption_timer_deadline) { + vmx->nested.preemption_timer_deadline = + vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; + vmx->nested.has_preemption_timer_deadline = true; + } + return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, + u64 preemption_timeout) { - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); /* @@ -2041,7 +2124,8 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); hrtimer_start(&vmx->nested.preemption_timer, - ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); + ktime_add_ns(ktime_get(), preemption_timeout), + HRTIMER_MODE_ABS_PINNED); } static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) @@ -2398,7 +2482,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *entry_failure_code) + enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; @@ -2447,32 +2531,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); - if (enable_vpid) { - /* - * There is no direct mapping between vpid02 and vpid12, the - * vpid02 is per-vCPU for L0 and reused while the value of - * vpid12 is changed w/ one invvpid during nested vmentry. - * The vpid12 is allocated by L1 for L2, so it will not - * influence global bitmap(for vpid01 and vpid02 allocation) - * even if spawn a lot of nested vCPUs. - */ - if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { - if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { - vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); - } - } else { - /* - * If L1 use EPT, then L0 needs to execute INVEPT on - * EPTP02 instead of EPTP01. Therefore, delay TLB - * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_MMU_PGD. Note that this assumes - * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest(). 
- */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - } + nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); @@ -2883,11 +2942,11 @@ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *exit_qual) + enum vm_entry_failure_code *entry_failure_code) { bool ia32e; - *exit_qual = ENTRY_FAIL_DEFAULT; + *entry_failure_code = ENTRY_FAIL_DEFAULT; if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) @@ -2902,7 +2961,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { - *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; + *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; return -EINVAL; } @@ -3028,9 +3087,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* * VMExit clears RFLAGS.IF and DR7, even on a consistency check. */ - local_irq_enable(); if (hw_breakpoint_active()) set_debugreg(__this_cpu_read(cpu_dr7), 7); + local_irq_enable(); preempt_enable(); /* @@ -3194,9 +3253,12 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + enum vm_entry_failure_code entry_failure_code; bool evaluate_pending_interrupts; - u32 exit_reason = EXIT_REASON_INVALID_STATE; - u32 exit_qual; + u32 exit_reason, failed_index; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); @@ -3241,24 +3303,33 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_VMFAIL; } - if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_guest_state(vcpu, vmcs12, + &entry_failure_code)) { + exit_reason = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit; + } } enter_guest_mode(vcpu); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset += vmcs12->tsc_offset; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { + exit_reason = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; + } if (from_vmentry) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) + failed_index = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (failed_index) { + exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode; + } } else { /* * The MMU is not initialized to point at the right entities yet and @@ -3293,8 +3364,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * the timer. */ vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) { + u64 timer_value = vmx_calc_preemption_timer_value(vcpu); + vmx_start_preemption_timer(vcpu, timer_value); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. 
At this point @@ -3322,7 +3395,6 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; - vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; @@ -3403,6 +3475,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ vmx->nested.nested_run_pending = 1; + vmx->nested.has_preemption_timer_deadline = false; status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; @@ -3632,6 +3705,12 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) vcpu->arch.exception.payload); } +static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + to_vmx(vcpu)->nested.preemption_timer_expired; +} + static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3661,11 +3740,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) /* * Process any exceptions that are not debug traps before MTF. */ - if (vcpu->arch.exception.pending && - !vmx_pending_dbg_trap(vcpu) && - nested_vmx_check_exception(vcpu, &exit_qual)) { + if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { if (block_nested_events) return -EBUSY; + if (!nested_vmx_check_exception(vcpu, &exit_qual)) + goto no_vmexit; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); return 0; } @@ -3678,25 +3757,34 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (vcpu->arch.exception.pending && - nested_vmx_check_exception(vcpu, &exit_qual)) { + if (vcpu->arch.exception.pending) { if (block_nested_events) return -EBUSY; + if (!nested_vmx_check_exception(vcpu, &exit_qual)) + goto no_vmexit; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); return 0; } - if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && - vmx->nested.preemption_timer_expired) { + if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); return 0; } - if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + if (block_nested_events) + return -EBUSY; + goto no_vmexit; + } + + if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; + if (!nested_exit_on_nmi(vcpu)) + goto no_vmexit; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK, 0); @@ -3709,13 +3797,16 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) { + if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { if (block_nested_events) return -EBUSY; + if (!nested_exit_on_intr(vcpu)) + goto no_vmexit; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); return 0; } +no_vmexit: vmx_complete_nested_posted_interrupt(vcpu); return 0; } @@ -3842,12 +3933,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load(&vmx->vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load(&vmx->vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); put_cpu(); } @@ -3876,10 
+3967,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -3889,9 +3976,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; if (nested_cpu_has_preemption_timer(vmcs12) && - vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) - vmcs12->vmx_preemption_timer_value = - vmx_get_preemption_timer_value(vcpu); + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && + !vmx->nested.nested_run_pending) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -3939,11 +4027,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * which already writes to vmcs12 directly. */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 exit_reason, u32 exit_intr_info, + u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { /* update exit information fields: */ - vmcs12->vm_exit_reason = exit_reason; + vmcs12->vm_exit_reason = vm_exit_reason; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@ -3998,8 +4086,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + enum vm_entry_failure_code ignored; struct kvm_segment seg; - u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -4034,30 +4122,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * Only PDPTE load can fail as the value of cr3 was checked on entry and * couldn't have changed. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - /* - * If vmcs01 doesn't use VPID, CPU flushes TLB on every - * VMEntry/VMExit. Thus, no need to flush TLB. - * - * If vmcs12 doesn't use VPID, L1 expects TLB to be - * flushed on every VMEntry/VMExit. - * - * Otherwise, we can preserve TLB entries as long as we are - * able to tag L1 TLB entries differently than L2 TLB entries. - * - * If vmcs12 uses EPT, we need to execute this flush on EPTP01 - * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_MMU_PGD. - */ - if (enable_vpid && - (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } + nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); @@ -4204,7 +4275,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) * VMFail, like everything else we just need to ensure our * software model is up-to-date. 
*/ - if (enable_ept) + if (enable_ept && is_pae_paging(vcpu)) ept_save_pdptrs(vcpu); kvm_mmu_reset_context(vcpu); @@ -4272,7 +4343,7 @@ vmabort: * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4281,6 +4352,10 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* Service the TLB flush request for L2 before switching to L1. */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) @@ -4292,9 +4367,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); - if (exit_reason != -1) - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); + if (vm_exit_reason != -1) + prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, + exit_intr_info, exit_qualification); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@ -4344,20 +4419,20 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; - /* - * We are now running in L2, mmu_notifier will force to reload the - * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. - */ - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + if (vmx->nested.reload_vmcs01_apic_access_page) { + vmx->nested.reload_vmcs01_apic_access_page = false; + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + } - if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + if ((vm_exit_reason != -1) && + (enable_shadow_vmcs || vmx->nested.hv_evmcs)) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); @@ -4365,7 +4440,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - if (exit_reason != -1) + if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, vmcs12->idt_vectoring_info_field, @@ -4554,13 +4629,13 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) gva_t gva; struct x86_exception e; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmcs_read32(VMX_INSTRUCTION_INFO), false, sizeof(*vmpointer), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } @@ -4614,7 +4689,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) goto out_shadow_vmcs; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_ABS_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; vmx->nested.vpid02 = allocate_vpid(); @@ -4819,7 +4894,7 @@ 
static int handle_vmread(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@ -4869,7 +4944,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } } @@ -4905,7 +4980,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@ -4943,7 +5018,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) instr_info, false, len, &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } } @@ -5090,7 +5165,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qual = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; @@ -5108,23 +5183,33 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, sizeof(gpa_t), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } return nested_vmx_succeed(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; - unsigned long type; + unsigned long type, roots_to_free; + struct kvm_mmu *mmu; gva_t gva; struct x86_exception e; struct { u64 eptp, gpa; } operand; + int i; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -5148,27 +5233,49 @@ static int handle_invept(struct kvm_vcpu *vcpu) /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } - switch (type) { - case VMX_EPT_EXTENT_GLOBAL: - case VMX_EPT_EXTENT_CONTEXT: /* - * TODO: Sync the necessary shadow EPT roots here, rather than - * at the next emulated VM-entry. + * Nested EPT roots are always held through guest_mmu, + * not root_mmu.
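EPTP_PA_MASK, defined just above, keeps only bits 51:12 of an EPT pointer, i.e. the physical address of the root table; the low bits encode the memory type, the page-walk length and the accessed/dirty enable, none of which should defeat a root match. A small standalone illustration (the addresses are invented; the bit encodings are per the Intel SDM):

#include <assert.h>
#include <stdint.h>

#define EPTP_PA_MASK	(((1ULL << 52) - 1) & ~((1ULL << 12) - 1))	/* GENMASK_ULL(51, 12) */

int main(void)
{
	/* Same root page; one EPTP has the A/D-enable bit 6 set. */
	uint64_t eptp_ad    = 0x12345000ULL | 0x5e;	/* WB, 4-level, A/D */
	uint64_t eptp_plain = 0x12345000ULL | 0x1e;	/* WB, 4-level */

	/* Masking strips the attribute bits, so the roots compare equal. */
	assert((eptp_ad & EPTP_PA_MASK) == (eptp_plain & EPTP_PA_MASK));
	return 0;
}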
*/ + mmu = &vcpu->arch.guest_mmu; + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if (!nested_vmx_check_eptp(vcpu, operand.eptp)) + return nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + + roots_to_free = 0; + if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_CURRENT; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (nested_ept_root_matches(mmu->prev_roots[i].hpa, + mmu->prev_roots[i].pgd, + operand.eptp)) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + } + break; + case VMX_EPT_EXTENT_GLOBAL: + roots_to_free = KVM_MMU_ROOTS_ALL; break; default: - BUG_ON(1); + BUG(); break; } + if (roots_to_free) + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); + return nested_vmx_succeed(vcpu); } @@ -5208,11 +5315,11 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } if (operand.vpid >> 16) @@ -5226,27 +5333,37 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) is_noncanonical_address(operand.gla, vcpu)) return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, - vpid02, operand.gla); - } else - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_context(vpid02); break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, vpid02, false); + vpid_sync_context(vpid02); break; default: WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu); } + /* + * Sync the shadow page tables if EPT is disabled, L1 is invalidating + * linear mappings for L2 (tagged with L2's VPID). Free all roots as + * VPIDs are not tracked in the MMU role. + * + * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share + * an MMU when EPT is disabled. + * + * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
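kvm_mmu_free_roots() consumes roots_to_free as a bitmask over the current root plus the cached previous roots. For reference, the bit layout as of this series (these defines live in kvm_host.h, not in this diff):

/* From include/linux/kvm_host.h at the time of this series: */
#define KVM_MMU_ROOT_CURRENT		BIT(0)
#define KVM_MMU_ROOT_PREVIOUS(i)	BIT(1 + (i))
#define KVM_MMU_ROOTS_ALL		(~0UL)

So a single-context INVEPT whose EPTP matches the active root and prev_roots[1] ends up freeing exactly BIT(0) | BIT(2), while the global variant unconditionally drops every root, which is why it needs no matching loop above.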
+ */ + if (!enable_ept) + kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, + KVM_MMU_ROOTS_ALL); + return nested_vmx_succeed(vcpu); } @@ -5327,8 +5444,8 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) fail: nested_vmx_vmexit(vcpu, vmx->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmx_get_intr_info(vcpu), + vmx_get_exit_qual(vcpu)); return 1; } @@ -5379,7 +5496,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -5433,7 +5550,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int cr = exit_qualification & 15; int reg; unsigned long val; @@ -5449,15 +5566,6 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return true; break; case 3: - if ((vmcs12->cr3_target_count >= 1 && - vmcs12->cr3_target_value0 == val) || - (vmcs12->cr3_target_count >= 2 && - vmcs12->cr3_target_value1 == val) || - (vmcs12->cr3_target_count >= 3 && - vmcs12->cr3_target_value2 == val) || - (vmcs12->cr3_target_count >= 4 && - vmcs12->cr3_target_value3 == val)) - return false; if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) return true; break; @@ -5533,50 +5641,103 @@ static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, return 1 & (b >> (field & 7)); } -/* - * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we - * should handle it ourselves in L0 (and then continue L2). Only call this - * when in is_guest_mode (L2). - */ -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) +static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) { - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - WARN_ON_ONCE(vmx->nested.nested_run_pending); + u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; - if (unlikely(vmx->fail)) { - trace_kvm_nested_vmenter_failed( - "hardware VM-instruction error: ", - vmcs_read32(VM_INSTRUCTION_ERROR)); + if (nested_cpu_has_mtf(vmcs12)) return true; - } - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + /* + * An MTF VM-exit may be injected into the guest by setting the + * interruption-type to 7 (other event) and the vector field to 0. Such + * is the case regardless of the 'monitor trap flag' VM-execution + * control. + */ + return entry_intr_info == (INTR_INFO_VALID_MASK + | INTR_TYPE_OTHER_EVENT); +} + +/* + * Return true if L0 wants to handle an exit from L2 regardless of whether or not + * L1 wants the exit. Only call this when in is_guest_mode (L2). 
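nested_vmx_exit_handled_mtf(), added above, also recognizes an MTF that L1 injected by hand: VM-entry interruption info with the valid bit set, type 7 (other event) and vector 0. With the standard VMX encodings (valid = bit 31, type = bits 10:8) the compared constant works out to 0x80000700; a compile-time check:

#include <assert.h>
#include <stdint.h>

#define INTR_INFO_VALID_MASK	0x80000000u	/* bit 31 */
#define INTR_TYPE_OTHER_EVENT	(7u << 8)	/* type 7 in bits 10:8 */

int main(void)
{
	/* VM-entry interruption info denoting an injected MTF event. */
	uint32_t entry_intr_info = INTR_INFO_VALID_MASK | INTR_TYPE_OTHER_EVENT;

	assert(entry_intr_info == 0x80000700u);	/* valid, type 7, vector 0 */
	return 0;
}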
+ */ +static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + u32 intr_info; switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) - return false; + return true; else if (is_page_fault(intr_info)) - return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; + return vcpu->arch.apf.host_apf_flags || !enable_ept; else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; + return true; else if (is_breakpoint(intr_info) && vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return false; + return true; + return false; + case EXIT_REASON_EXTERNAL_INTERRUPT: + return true; + case EXIT_REASON_MCE_DURING_VMENTRY: + return true; + case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return true; + case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ + return true; + case EXIT_REASON_PREEMPTION_TIMER: + return true; + case EXIT_REASON_PML_FULL: + /* We emulate PML support to L1. */ + return true; + case EXIT_REASON_VMFUNC: + /* VM functions are emulated through L2->L0 vmexits. */ + return true; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return true; + default: + break; + } + return false; +} + +/* + * Return 1 if L1 wants to intercept an exit from L2. Only call this when in + * is_guest_mode (L2). + */ +static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 intr_info; + + switch (exit_reason) { + case EXIT_REASON_EXCEPTION_NMI: + intr_info = vmx_get_intr_info(vcpu); + if (is_nmi(intr_info)) + return true; + else if (is_page_fault(intr_info)) + return true; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: - return false; + return nested_exit_on_intr(vcpu); case EXIT_REASON_TRIPLE_FAULT: return true; case EXIT_REASON_INTERRUPT_WINDOW: @@ -5633,7 +5794,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_TRAP_FLAG: - return nested_cpu_has_mtf(vmcs12); + return nested_vmx_exit_handled_mtf(vmcs12); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: @@ -5641,7 +5802,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) nested_cpu_has2(vmcs12, SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: - return false; + return true; case EXIT_REASON_TPR_BELOW_THRESHOLD: return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: @@ -5653,22 +5814,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) * delivery" only come from vmcs12. */ return true; - case EXIT_REASON_EPT_VIOLATION: - /* - * L0 always deals with the EPT violation. 
If nested EPT is - * used, and the nested mmu code discovers that the address is - * missing in the guest EPT table (EPT12), the EPT violation - * will be injected with nested_ept_inject_page_fault() - */ - return false; - case EXIT_REASON_EPT_MISCONFIG: - /* - * L2 never uses directly L1's EPT, but rather L0's own EPT - * table (shadow on EPT) or a merged EPT table that L0 built - * (EPT on EPT). So any problems with the structure of the - * table is L0's fault. - */ - return false; case EXIT_REASON_INVPCID: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && @@ -5685,17 +5830,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PREEMPTION_TIMER: - return false; - case EXIT_REASON_PML_FULL: - /* We emulate PML support to L1. */ - return false; - case EXIT_REASON_VMFUNC: - /* VM functions are emulated through L2->L0 vmexits. */ - return false; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return false; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@ -5705,6 +5839,67 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) } } +/* + * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was + * reflected into L1. + */ +bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exit_reason = vmx->exit_reason; + unsigned long exit_qual; + u32 exit_intr_info; + + WARN_ON_ONCE(vmx->nested.nested_run_pending); + + /* + * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM + * has already loaded L2's state. + */ + if (unlikely(vmx->fail)) { + trace_kvm_nested_vmenter_failed( + "hardware VM-instruction error: ", + vmcs_read32(VM_INSTRUCTION_ERROR)); + exit_intr_info = 0; + exit_qual = 0; + goto reflect_vmexit; + } + + exit_intr_info = vmx_get_intr_info(vcpu); + exit_qual = vmx_get_exit_qual(vcpu); + + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, + vmx->idt_vectoring_info, exit_intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + + /* If L0 (KVM) wants the exit, it trumps L1's desires. */ + if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) + return false; + + /* If L1 doesn't want the exit, handle it in L0. */ + if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) + return false; + + /* + * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For + * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would + * need to be synthesized by querying the in-kernel LAPIC, but external + * interrupts are never reflected to L1 so it's a non-issue. 
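nested_vmx_reflect_vmexit(), whose body begins in the next hunk, turns the old single exit-reflection predicate into two explicitly ordered questions. Condensed into a sketch (tracing and the vm_exit_intr_error_code plumbing omitted):

/* Sketch: the routing policy; L0's needs always trump L1's. */
static bool reflect_to_l1(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;	/* KVM handles it; L2 keeps running */

	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;	/* L1 didn't intercept it either */

	return true;		/* forward the VM-Exit to L1 */
}

Note how EXIT_REASON_EXCEPTION_NMI appears in both switches: a page fault stays in L0 whenever a host async page fault is in flight or EPT is disabled, and only otherwise is it reflected (per nested_vmx_l1_wants_exit(), page faults that reach that point are always reflected).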
+ */ + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + +reflect_vmexit: + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); + return true; +} static int vmx_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, @@ -5716,8 +5911,10 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, .flags = 0, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), + .hdr.vmx.flags = 0, .hdr.vmx.vmxon_pa = -1ull, .hdr.vmx.vmcs12_pa = -1ull, + .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; @@ -5759,6 +5956,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx->nested.mtf_pending) kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; + + if (nested_cpu_has_preemption_timer(vmcs12) && + vmx->nested.has_preemption_timer_deadline) { + kvm_state.hdr.vmx.flags |= + KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; + kvm_state.hdr.vmx.preemption_timer_deadline = + vmx->nested.preemption_timer_deadline; + } } } @@ -5804,7 +6009,6 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; } - out: return kvm_state.size; } @@ -5827,7 +6031,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; - u32 exit_qual; + enum vm_entry_failure_code ignored; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; int ret; @@ -5966,9 +6170,15 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; } + if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { + vmx->nested.has_preemption_timer_deadline = true; + vmx->nested.preemption_timer_deadline = + kvm_state->hdr.vmx.preemption_timer_deadline; + } + if (nested_vmx_check_controls(vcpu, vmcs12) || nested_vmx_check_host_state(vcpu, vmcs12) || - nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) + nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; @@ -6014,7 +6224,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_reflected() will not pass related exits to L1. + * nested_vmx_l1_wants_exit() will not pass related exits to L1. * These rules have exceptions below. 
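The KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE flag above makes the remaining preemption-timer budget part of the migratable nested state, so the timer resumes on the destination instead of restarting from the full vmcs12 value. A hedged userspace sketch (the hdr.vmx fields are per the uapi header this series extends; buffer sizing and error handling elided):

#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: carry nested VMX state, including the timer deadline. */
static void migrate_nested_state(int src_vcpu_fd, int dst_vcpu_fd)
{
	uint8_t buf[16384];
	struct kvm_nested_state *state = (struct kvm_nested_state *)buf;

	memset(buf, 0, sizeof(buf));
	state->size = sizeof(buf);
	ioctl(src_vcpu_fd, KVM_GET_NESTED_STATE, state);

	/*
	 * If KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE is set in
	 * state->hdr.vmx.flags, hdr.vmx.preemption_timer_deadline rides
	 * along and is restored verbatim by the matching SET below.
	 */
	ioctl(dst_vcpu_fd, KVM_SET_NESTED_STATE, state);
}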
*/ @@ -6242,8 +6452,7 @@ void nested_vmx_hardware_unsetup(void) } } -__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, - int (*exit_handlers[])(struct kvm_vcpu *)) +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; @@ -6279,12 +6488,15 @@ __init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; - ops->check_nested_events = vmx_check_nested_events; - ops->get_nested_state = vmx_get_nested_state; - ops->set_nested_state = vmx_set_nested_state; - ops->get_vmcs12_pages = nested_get_vmcs12_pages; - ops->nested_enable_evmcs = nested_enable_evmcs; - ops->nested_get_evmcs_version = nested_get_evmcs_version; - return 0; } + +struct kvm_x86_nested_ops vmx_nested_ops = { + .check_events = vmx_check_nested_events, + .hv_timer_pending = nested_vmx_preemption_timer_pending, + .get_state = vmx_get_nested_state, + .set_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .enable_evmcs = nested_enable_evmcs, + .get_evmcs_version = nested_get_evmcs_version, +}; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index ac56aefa49e3..758bccc26cf9 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -19,14 +19,13 @@ enum nvmx_vmentry_status { void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps); void nested_vmx_hardware_unsetup(void); -__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, - int (*exit_handlers[])(struct kvm_vcpu *)); +__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); -bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, +bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); +void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification); void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); @@ -62,6 +61,13 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs; } +static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; +} + static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu) { /* return the page table to be shadowed - in our case, EPT12 */ @@ -74,34 +80,6 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) } /* - * Reflect a VM Exit into L1. - */ -static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, - u32 exit_reason) -{ - u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - /* - * At this point, the exit interruption info in exit_intr_info - * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT - * we need to query the in-kernel LAPIC. 
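Replacing the individual nested callbacks in kvm_x86_ops with a single kvm_x86_nested_ops table (vmx_nested_ops above) lets common x86 code test for nested support once and dispatch uniformly. A sketch of a common-code consumer, assuming kvm_x86_ops grows a nested_ops pointer as the struct name suggests:

/* Sketch: common code dispatching through the nested ops table. */
static int kvm_check_nested_events(struct kvm_vcpu *vcpu)
{
	if (!kvm_x86_ops.nested_ops || !is_guest_mode(vcpu))
		return 0;

	return kvm_x86_ops.nested_ops->check_events(vcpu);
}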
- */ - WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - } - - nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, - vmcs_readl(EXIT_QUALIFICATION)); - return 1; -} - -/* * Return the cr0 value that a nested guest would read. This is a combination * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by * its hypervisor (cr0_read_shadow). @@ -246,6 +224,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; } +static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); +} + /* * In nested virtualization, check if L1 asked to exit on external interrupts. * For most existing hypervisors, this will always return true. @@ -299,4 +282,6 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid +extern struct kvm_x86_nested_ops vmx_nested_ops; + #endif /* __KVM_X86_VMX_NESTED_H */ diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 19717d0a1100..5f1ac002b4b6 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -268,42 +268,38 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); } -static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) -{ - if (vpid == 0) - return true; - - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); - return true; - } - - return false; -} - static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) return; - if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) { - if (cpu_has_vmx_invvpid_global()) - __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) vpid_sync_vcpu_single(vpid); - else + else if (vpid != 0) vpid_sync_vcpu_global(); } +static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return; + + if (cpu_has_vmx_invvpid_individual_addr()) + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + else + vpid_sync_context(vpid); +} + static inline void ept_sync_global(void) { __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7c857737b438..d33d890b605f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -18,6 +18,8 @@ #include "nested.h" #include "pmu.h" +#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) + static struct kvm_event_hw_type_mapping intel_arch_events[] = { /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, @@ -150,6 +152,22 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return false; + + return 
vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES; +} + +static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) +{ + if (!fw_writes_is_enabled(pmu_to_vcpu(pmu))) + return NULL; + + return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -162,10 +180,13 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; + case MSR_IA32_PERF_CAPABILITIES: + ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM); + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr); + get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr); break; } @@ -184,35 +205,45 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } -static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; + u32 msr = msr_info->index; switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - *data = pmu->fixed_ctr_ctrl; + msr_info->data = pmu->fixed_ctr_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_STATUS: - *data = pmu->global_status; + msr_info->data = pmu->global_status; return 0; case MSR_CORE_PERF_GLOBAL_CTRL: - *data = pmu->global_ctrl; + msr_info->data = pmu->global_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - *data = pmu->global_ovf_ctrl; + msr_info->data = pmu->global_ovf_ctrl; + return 0; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; return 0; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { u64 val = pmc_read_counter(pmc); - *data = val & pmu->counter_bitmask[KVM_PMC_GP]; + msr_info->data = + val & pmu->counter_bitmask[KVM_PMC_GP]; return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { u64 val = pmc_read_counter(pmc); - *data = val & pmu->counter_bitmask[KVM_PMC_FIXED]; + msr_info->data = + val & pmu->counter_bitmask[KVM_PMC_FIXED]; return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { - *data = pmc->eventsel; + msr_info->data = pmc->eventsel; return 0; } } @@ -258,9 +289,22 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated) + return 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ? 
+ (data & ~vmx_get_perf_capabilities()) : data) + return 1; + vcpu->arch.perf_capabilities = data; + return 0; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { - if (!msr_info->host_initiated) + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { + if ((msr & MSR_PMC_FULL_WIDTH_BIT) && + (data & ~pmu->counter_bitmask[KVM_PMC_GP])) + return 1; + if (!msr_info->host_initiated && + !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); if (pmc->perf_event) @@ -300,6 +344,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; + vcpu->arch.perf_capabilities = 0; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -312,6 +357,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return; perf_get_x86_pmu_capability(&x86_pmu); + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, x86_pmu.num_counters_gp); diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 481ad879197b..5c0ff80b85c0 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -19,7 +19,7 @@ struct vmcs_hdr { struct vmcs { struct vmcs_hdr hdr; u32 abort; - char data[0]; + char data[]; }; DECLARE_PER_CPU(struct vmcs *, current_vmcs); diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 53dfb401316d..c8e51c004f78 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -115,10 +115,6 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), FIELD(CR0_READ_SHADOW, cr0_read_shadow), FIELD(CR4_READ_SHADOW, cr4_read_shadow), - FIELD(CR3_TARGET_VALUE0, cr3_target_value0), - FIELD(CR3_TARGET_VALUE1, cr3_target_value1), - FIELD(CR3_TARGET_VALUE2, cr3_target_value2), - FIELD(CR3_TARGET_VALUE3, cr3_target_value3), FIELD(EXIT_QUALIFICATION, exit_qualification), FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), FIELD(GUEST_CR0, guest_cr0), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index d0c6df373f67..80232daf00ff 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -80,10 +80,7 @@ struct __packed vmcs12 { natural_width cr4_guest_host_mask; natural_width cr0_read_shadow; natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; + natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. 
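Back in the pmu_intel.c hunk above: MSR_PMC_FULL_WIDTH_BIT is the fixed distance between the full-width counter aliases (MSR_IA32_PMC0, 0x4c1) and the legacy aliases (MSR_IA32_PERFCTR0, 0xc1), i.e. 0x400, so testing that bit on the incoming MSR index tells the two apart. The behavioural difference, as a worked example (a 48-bit general-purpose counter is assumed for illustration):

#include <assert.h>
#include <stdint.h>

#define GP_COUNTER_MASK	((1ULL << 48) - 1)	/* assumed counter width */

int main(void)
{
	uint64_t data = 0x80000000ULL;		/* bit 31 set */

	/* Legacy alias (0xc1): guest writes are sign-extended from bit 31. */
	uint64_t legacy = (uint64_t)(int64_t)(int32_t)data & GP_COUNTER_MASK;
	assert(legacy == (0xffffffff80000000ULL & GP_COUNTER_MASK));

	/* Full-width alias (0x4c1): taken verbatim, but must fit the mask. */
	assert((data & ~GP_COUNTER_MASK) == 0);	/* else the write #GPs */
	return 0;
}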
*/ natural_width exit_qualification; natural_width guest_linear_address; natural_width guest_cr0; @@ -263,10 +260,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); CHECK_OFFSET(cr4_read_shadow, 368); - CHECK_OFFSET(cr3_target_value0, 376); - CHECK_OFFSET(cr3_target_value1, 384); - CHECK_OFFSET(cr3_target_value2, 392); - CHECK_OFFSET(cr3_target_value3, 400); + CHECK_OFFSET(dead_space, 376); CHECK_OFFSET(exit_qualification, 408); CHECK_OFFSET(guest_linear_address, 416); CHECK_OFFSET(guest_cr0, 424); diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 87f3f24fef37..e0a182cb3cdd 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -82,6 +82,9 @@ SYM_FUNC_START(vmx_vmexit) /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */ + or $1, %_ASM_AX + pop %_ASM_AX .Lvmexit_skip_rsb: #endif @@ -163,13 +166,13 @@ SYM_FUNC_START(__vmx_vcpu_run) mov WORD_SIZE(%_ASM_SP), %_ASM_AX /* Save all guest registers, including RAX from the stack */ - __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) - mov %_ASM_CX, VCPU_RCX(%_ASM_AX) - mov %_ASM_DX, VCPU_RDX(%_ASM_AX) - mov %_ASM_BX, VCPU_RBX(%_ASM_AX) - mov %_ASM_BP, VCPU_RBP(%_ASM_AX) - mov %_ASM_SI, VCPU_RSI(%_ASM_AX) - mov %_ASM_DI, VCPU_RDI(%_ASM_AX) + pop VCPU_RAX(%_ASM_AX) + mov %_ASM_CX, VCPU_RCX(%_ASM_AX) + mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) + mov %_ASM_SI, VCPU_RSI(%_ASM_AX) + mov %_ASM_DI, VCPU_RDI(%_ASM_AX) #ifdef CONFIG_X86_64 mov %r8, VCPU_R8 (%_ASM_AX) mov %r9, VCPU_R9 (%_ASM_AX) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 83050977490c..2b5ba6063a2d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -437,6 +437,11 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + static unsigned long host_idt_base; /* @@ -1306,10 +1311,12 @@ after_clear_sn: pi_set_on(pi_desc); } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + struct loaded_vmcs *buddy) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; + struct vmcs *prev; if (!already_loaded) { loaded_vmcs_clear(vmx->loaded_vmcs); @@ -1328,16 +1335,28 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) local_irq_enable(); } - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + prev = per_cpu(current_vmcs, cpu); + if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - indirect_branch_prediction_barrier(); + + /* + * No indirect branch prediction barrier needed when switching + * the active VMCS within a guest, e.g. on nested VM-Enter. + * The L1 VMM can protect itself with retpolines, IBPB or IBRS. + */ + if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) + indirect_branch_prediction_barrier(); } if (!already_loaded) { void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; + /* + * Flush all EPTP/VPID contexts, the new pCPU may have stale + * TLB entries from its previous association with the vCPU. 
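The new buddy argument to vmx_vcpu_load_vmcs() encodes why the IBPB may be skipped: when a vCPU merely switches between its own vmcs01 and vmcs02, both VMCSes belong to the same guest, so there is no foreign guest to protect from poisoned branch predictors. A sketch of the nested switch path this is built for (the actual vmx_switch_vmcs() is not part of these hunks):

/* Sketch: VMCS switch on one vCPU, outgoing VMCS passed as buddy. */
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev = vmx->loaded_vmcs;
	int cpu;

	if (prev == vmcs)
		return;

	cpu = get_cpu();
	vmx->loaded_vmcs = vmcs;
	/* buddy == prev: same guest either way, so the IBPB is elided. */
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	put_cpu();
}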
+ */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* @@ -1364,15 +1383,14 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ -void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu); + vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); - vmx->host_pkru = read_pkru(); vmx->host_debugctlmsr = get_debugctlmsr(); } @@ -1547,7 +1565,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rip; + unsigned long rip, orig_rip; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on @@ -1559,8 +1577,17 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + orig_rip = kvm_rip_read(vcpu); + rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); +#ifdef CONFIG_X86_64 + /* + * We need to mask out the high 32 bits of RIP if not in 64-bit + * mode, but just finding out that we are in 64-bit mode is + * quite expensive. Only do it if there was a carry. + */ + if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) + rip = (u32)rip; +#endif kvm_rip_write(vcpu, rip); } else { if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) @@ -1713,17 +1740,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } -static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) - return vcpu->arch.tsc_offset - vmcs12->tsc_offset; - - return vcpu->arch.tsc_offset; -} - static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -1772,6 +1788,9 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + case MSR_IA32_PERF_CAPABILITIES: + msr->data = vmx_get_perf_capabilities(); + return 0; default: return 1; } @@ -1927,6 +1946,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } +static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, + u64 data) +{ +#ifdef CONFIG_X86_64 + if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return (u32)data; +#endif + return (unsigned long)data; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. 
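The carry test in skip_emulated_instruction() above merits a worked example: XORing the old and new RIP exposes exactly the changed bits, and a shifted result of 0b11 means bit 31 flipped while bit 32 got set with nothing above changing, which only happens when the addition carried out of the low 32 bits:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* 32-bit guest with RIP just below 4 GiB, 4-byte instruction. */
	uint64_t orig_rip = 0xfffffffeULL;
	uint64_t rip = orig_rip + 4;		/* 0x100000002 */

	/* Carry out of bit 31: bits 31 and 32 differ, nothing higher. */
	assert(((rip ^ orig_rip) >> 31) == 3);

	/* Outside 64-bit mode the CPU would have wrapped RIP to 32 bits. */
	assert((uint32_t)rip == 0x2);
	return 0;
}

Only when this cheap test fires does the code pay for the comparatively expensive is_64_bit_mode() check.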
@@ -1964,13 +1993,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { + data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_eip = data; + } vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { + data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_esp = data; + } vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: @@ -2188,6 +2221,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + unsigned long guest_owned_bits; + kvm_register_mark_available(vcpu, reg); switch (reg) { @@ -2201,10 +2236,22 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (enable_ept) ept_save_pdptrs(vcpu); break; + case VCPU_EXREG_CR0: + guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; + break; case VCPU_EXREG_CR3: if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; + case VCPU_EXREG_CR4: + guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; + break; default: WARN_ON_ONCE(1); break; @@ -2838,34 +2885,64 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { - int vpid = to_vmx(vcpu)->vpid; - - if (!vpid_sync_vcpu_addr(vpid, addr)) - vpid_sync_context(vpid); + struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * If VPIDs are not supported or enabled, then the above is a no-op. - * But we don't really need a TLB flush in that case anyway, because - * each VM entry/exit includes an implicit flush when VPID is 0. + * INVEPT must be issued when EPT is enabled, irrespective of VPID, as + * the CPU is not required to invalidate guest-physical mappings on + * VM-Entry, even if VPID is disabled. Guest-physical mappings are + * associated with the root EPT structure and not any particular VPID + * (INVVPID also isn't required to invalidate guest-physical mappings). */ + if (enable_ept) { + ept_sync_global(); + } else if (enable_vpid) { + if (cpu_has_vmx_invvpid_global()) { + vpid_sync_vcpu_global(); + } else { + vpid_sync_vcpu_single(vmx->vpid); + vpid_sync_vcpu_single(vmx->nested.vpid02); + } + } } -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { - ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + u64 root_hpa = vcpu->arch.mmu->root_hpa; - vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; + /* No flush required if the current context is invalid. 
*/ + if (!VALID_PAGE(root_hpa)) + return; + + if (enable_ept) + ept_sync_context(construct_eptp(vcpu, root_hpa)); + else if (!is_guest_mode(vcpu)) + vpid_sync_context(to_vmx(vcpu)->vpid); + else + vpid_sync_context(nested_get_vpid02(vcpu)); } -static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { - ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + /* + * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vmx_flush_tlb_guest() for an explanation of why this is ok. + */ + vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); +} - vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; +static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + /* + * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 + * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit + * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), + * i.e. no explicit INVVPID is necessary. + */ + vpid_sync_context(to_vmx(vcpu)->vpid); } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -2887,12 +2964,13 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (is_pae_paging(vcpu)) { - mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - } + if (WARN_ON_ONCE(!is_pae_paging(vcpu))) + return; + + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } @@ -2956,20 +3034,27 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); } -static int get_ept_level(struct kvm_vcpu *vcpu) +static int vmx_get_tdp_level(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; } +static int get_ept_level(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) + return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); + + return vmx_get_tdp_level(vcpu); +} + u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp = VMX_EPTP_MT_WB; @@ -2984,16 +3069,15 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; - guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(vcpu, cr3); + eptp = construct_eptp(vcpu, pgd); vmcs_write64(EPT_POINTER, eptp); if (kvm_x86_ops.tlb_remote_flush) { @@ -3004,16 +3088,15 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); } - /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. 
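construct_eptp() in the hunk above composes the EPT pointer from the root's physical address plus the SDM-defined low bits: bits 2:0 memory type (6 = write-back), bits 5:3 page-walk length minus one, bit 6 accessed/dirty enable. A worked example with an invented, page-aligned root:

#include <assert.h>
#include <stdint.h>

#define VMX_EPTP_MT_WB	0x6ULL	/* write-back memory type */
#define VMX_EPTP_PWL_4	0x18ULL	/* (4 - 1) << 3: 4-level walk */
#define VMX_EPTP_AD	0x40ULL	/* accessed/dirty flags enable */

int main(void)
{
	uint64_t root_hpa = 0x12345000ULL;	/* invented for the example */
	uint64_t eptp = VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 | root_hpa;

	assert(eptp == 0x1234501eULL);
	/* With A/D logging enabled, bit 6 is ORed in as well. */
	assert((eptp | VMX_EPTP_AD) == 0x1234505eULL);
	return 0;
}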
*/ - if (is_guest_mode(vcpu)) - update_guest_cr3 = false; - else if (!enable_unrestricted_guest && !is_paging(vcpu)) + if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) guest_cr3 = vcpu->arch.cr3; else /* vmcs01.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; ept_load_pdptrs(vcpu); + } else { + guest_cr3 = pgd; } if (update_guest_cr3) @@ -3064,6 +3147,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; vcpu->arch.cr4 = cr4; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); if (!enable_unrestricted_guest) { if (enable_ept) { @@ -3852,7 +3936,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (vcpu != kvm_get_running_vcpu() && + !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); return 0; @@ -4148,8 +4233,7 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, - VMX_EPT_MISCONFIG_WX_VALUE, 0); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); } #define VMX_XSS_EXIT_BITMAP 0 @@ -4454,31 +4538,54 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) +bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; + if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) + return false; - if (!enable_vnmi && - to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) - return 0; + if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) + return true; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); + return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_NMI)); } -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) - return false; + return -EBUSY; + + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) + return -EBUSY; + + return !vmx_nmi_blocked(vcpu); +} +bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) +{ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - return true; + return false; - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); + return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (to_vmx(vcpu)->nested.nested_run_pending) + return -EBUSY; + + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. 
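Splitting vmx_nmi_allowed()/vmx_interrupt_allowed() into a _blocked() predicate plus an int-returning wrapper gives the common injection code three answers instead of two: inject now, blocked by guest state, or -EBUSY because a nested VM-Enter is pending and the question cannot be answered yet. A hedged sketch of a consumer:

/* Sketch: consuming the tri-state result in the injection path. */
static int try_inject_nmi(struct kvm_vcpu *vcpu)
{
	int r = kvm_x86_ops.nmi_allowed(vcpu, /*for_injection=*/true);

	if (r == -EBUSY)
		return r;	/* retry once the pending VM-Enter completes */
	if (!r)
		return 0;	/* blocked: request an NMI-window exit instead */

	return 1;		/* safe to inject the NMI now */
}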
+ */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return -EBUSY; + + return !vmx_interrupt_blocked(vcpu); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4519,10 +4626,8 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) return false; /* fall through */ case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; - /* fall through */ + return !(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: @@ -4572,13 +4677,13 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, */ static void kvm_machine_check(void) { -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_MCE) struct pt_regs regs = { .cs = 3, /* Fake ring 3 no matter what the guest ran on */ .flags = X86_EFLAGS_IF, }; - do_machine_check(®s, 0); + do_machine_check(®s); #endif } @@ -4617,7 +4722,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) u32 vect_info; vect_info = vmx->idt_vectoring_info; - intr_info = vmx->exit_intr_info; + intr_info = vmx_get_intr_info(vcpu); if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ @@ -4661,9 +4766,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - cr2 = vmcs_readl(EXIT_QUALIFICATION); + cr2 = vmx_get_exit_qual(vcpu); /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept); return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } @@ -4674,18 +4779,16 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) switch (ex_no) { case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); + dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; } - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: @@ -4743,7 +4846,7 @@ static int handle_io(struct kvm_vcpu *vcpu) int size, in, string; unsigned port; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); string = (exit_qualification & 16) != 0; ++vcpu->stat.io_exits; @@ -4834,7 +4937,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) int err; int ret; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { @@ -4911,7 +5014,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) unsigned long exit_qualification; int dr, dr7, reg; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ @@ -4929,16 +5032,14 @@ static int handle_dr(struct kvm_vcpu *vcpu) * guest debugging itself. 
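kvm_queue_exception_p() above attaches dr6 to the queued #DB as an exception payload instead of updating vcpu->arch.dr6 on the spot. The payload is merged into the architectural DR6 only when the exception is actually delivered, which keeps DR6 correct if L1 intercepts the #DB first or if vCPU state is read out for migration in between. Schematically (the helper internals are assumed, not shown in this diff):

/*
 * Sketch of the payload contract:
 *
 *   kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
 *       records vector + payload; vcpu->arch.dr6 is NOT touched yet
 *
 *   kvm_deliver_exception_payload(vcpu);     at injection time
 *       merges the payload, roughly:
 *           vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 *           vcpu->arch.dr6 |= payload;       plus RTM-bit fixups
 */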
*/ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); return 1; } } @@ -4969,15 +5070,6 @@ static int handle_dr(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.dr6; -} - -static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) -{ -} - static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); @@ -5024,7 +5116,7 @@ static int handle_invd(struct kvm_vcpu *vcpu) static int handle_invlpg(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); kvm_mmu_invlpg(vcpu, exit_qualification); return kvm_skip_emulated_instruction(vcpu); @@ -5056,7 +5148,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; @@ -5077,7 +5169,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ @@ -5087,7 +5179,7 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 offset = exit_qualification & 0xfff; /* APIC-write VM exit is trap-like and thus no need to adjust IP */ @@ -5108,7 +5200,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { @@ -5158,7 +5250,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa_t gpa; u64 error_code; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5230,18 +5322,11 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) bool intr_window_requested; unsigned count = 130; - /* - * We should never reach the point where we are emulating L2 - * due to invalid guest state as that means we incorrectly - * allowed a nested VMEntry with an invalid vmcs12. 
- */ - WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); - intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; while (vmx->emulation_required && count-- != 0) { - if (intr_window_requested && vmx_interrupt_allowed(vcpu)) + if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); if (kvm_test_request(KVM_REQ_EVENT, vcpu)) @@ -5418,13 +5503,13 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { - kvm_inject_page_fault(vcpu, &e); + kvm_inject_emulated_page_fault(vcpu, &e); return 1; } @@ -5453,11 +5538,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) if (kvm_get_active_pcid(vcpu) == operand.pcid) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) == operand.pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); @@ -5494,7 +5579,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) trace_kvm_pml_full(vcpu->vcpu_id); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * PML buffer FULL happened while executing iret from NMI, @@ -5513,14 +5598,22 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static int handle_preemption_timer(struct kvm_vcpu *vcpu) +static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!vmx->req_immediate_exit && - !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) + !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; + } + return EXIT_FASTPATH_NONE; +} + +static int handle_preemption_timer(struct kvm_vcpu *vcpu) +{ + handle_fastpath_preemption_timer(vcpu); return 1; } @@ -5608,8 +5701,8 @@ static const int kvm_vmx_max_exit_handlers = static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); + *info1 = vmx_get_exit_qual(vcpu); + *info2 = vmx_get_intr_info(vcpu); } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -5691,7 +5784,6 @@ void dump_vmcs(void) u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; unsigned long cr4; u64 efer; - int i, n; if (!dump_invalid_vmcs) { pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); @@ -5828,14 +5920,6 @@ void dump_vmcs(void) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); - n = vmcs_read32(CR3_TARGET_COUNT); - for (i = 0; i + 1 < n; i += 4) - pr_err("CR3 target%u=%016lx target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), - i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); - if (i < n) - pr_err("CR3 target%u=%016lx\n", - i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); if (secondary_exec_control & 
SECONDARY_EXEC_PAUSE_LOOP_EXITING) pr_err("PLE Gap=%08x Window=%08x\n", vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); @@ -5848,15 +5932,12 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion exit_fastpath) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before @@ -5867,6 +5948,14 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (enable_pml) vmx_flush_pml_buffer(vcpu); + /* + * We should never reach this point with a pending nested VM-Enter, and + * more specifically emulation of L2 due to invalid guest state (see + * below) should never happen as that means we incorrectly allowed a + * nested VM-Enter with an invalid vmcs12. + */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -5885,8 +5974,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, */ nested_mark_vmcs12_pages_dirty(vcpu); - if (nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); + if (nested_vmx_reflect_vmexit(vcpu)) + return 1; } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { @@ -5933,7 +6022,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) { - if (vmx_interrupt_allowed(vcpu)) { + if (!vmx_interrupt_blocked(vcpu)) { vmx->loaded_vmcs->soft_vnmi_blocked = 0; } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && vcpu->arch.nmi_pending) { @@ -5950,10 +6039,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, } } - if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { - kvm_skip_emulated_instruction(vcpu); + if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - } if (exit_reason >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; @@ -6107,7 +6194,15 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (flexpriority_enabled) { sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb(vcpu, true); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + + /* + * Flush the TLB, reloading the APIC access page will + * only do so if its physical address has changed, but + * the guest may have inserted a non-APIC mapping into + * the TLB while the APIC access page was disabled. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } break; case LAPIC_MODE_X2APIC: @@ -6121,12 +6216,32 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap(vcpu); } -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu)) { - vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb(vcpu, true); + struct page *page; + + /* Defer reload until vmcs01 is the current VMCS. 
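With the fastpath expressed as a proper enum, vmx_handle_exit() above can treat any exit other than EXIT_FASTPATH_NONE as fully handled, and EXIT_FASTPATH_REENTER_GUEST (introduced in the preemption-timer hunk earlier) loops straight back into the guest via the reenter_guest label added further below, without ever reaching the generic exit handlers. A condensed sketch of the resulting shape (__vmx_enter_guest() is a hypothetical stand-in for the actual VM-Enter body):

/* Sketch: how the fastpath result steers the VMX run loop. */
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	fastpath_t exit_fastpath;

reenter_guest:
	exit_fastpath = __vmx_enter_guest(vcpu);	/* hypothetical helper */

	if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST)
		goto reenter_guest;	/* exit fully handled; go again */

	return exit_fastpath;	/* fall out to vmx_handle_exit() */
}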
*/ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; + return; } + + if (!(secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return; + + page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; + + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + vmx_flush_tlb_current(vcpu); + + /* + * Do not pin apic access page in memory, the MMU notifier + * will call us again if it is migrated or swapped out. + */ + put_page(page); } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) @@ -6244,16 +6359,16 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) { - vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + if (is_page_fault(intr_info)) { + vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(vmx->exit_intr_info)) { + } else if (is_machine_check(intr_info)) { kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@ -6268,9 +6383,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned long tmp; #endif gate_desc *desc; - u32 intr_info; + u32 intr_info = vmx_get_intr_info(vcpu); - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6283,13 +6397,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) asm volatile( #ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" - "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" - "push $%c[ss]\n\t" + "mov %%rsp, %[sp]\n\t" + "and $-16, %%rsp\n\t" + "push %[ss]\n\t" "push %[sp]\n\t" #endif "pushf\n\t" - __ASM_SIZE(push) " $%c[cs]\n\t" + "push %[cs]\n\t" CALL_NOSPEC : #ifdef CONFIG_X86_64 @@ -6298,7 +6412,9 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) ASM_CALL_CONSTRAINT : [thunk_target]"r"(entry), +#ifdef CONFIG_X86_64 [ss]"i"(__KERNEL_DS), +#endif [cs]"i"(__KERNEL_CS) ); @@ -6306,8 +6422,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6315,12 +6430,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); - else if (!is_guest_mode(vcpu) && - vmx->exit_reason == EXIT_REASON_MSR_WRITE) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } -static bool vmx_has_emulated_msr(int index) +static bool vmx_has_emulated_msr(u32 index) { switch (index) { case MSR_IA32_SMBASE: @@ -6351,11 +6463,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) if (enable_vnmi) { if (vmx->loaded_vmcs->nmi_known_unmasked) return; - /* - * Can't use 
vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + exit_intr_info = vmx_get_intr_info(&vmx->vcpu); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -6522,13 +6631,27 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + switch (to_vmx(vcpu)->exit_reason) { + case EXIT_REASON_MSR_WRITE: + return handle_fastpath_set_msr_irqoff(vcpu); + case EXIT_REASON_PREEMPTION_TIMER: + return handle_fastpath_preemption_timer(vcpu); + default: + return EXIT_FASTPATH_NONE; + } +} + bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { + fastpath_t exit_fastpath; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; +reenter_guest: /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) @@ -6537,7 +6660,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) - return; + return EXIT_FASTPATH_NONE; if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -6577,11 +6700,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_load_guest_xsave_state(vcpu); - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && - vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vcpu->arch.pkru); - pt_guest_enter(vmx); if (vcpu_to_pmu(vcpu)->version) @@ -6662,44 +6780,54 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR3)); - vcpu->arch.regs_dirty = 0; + vmx_register_cache_reset(vcpu); pt_guest_exit(vmx); - /* - * eager fpu is enabled if PKEY is supported and CR4 is switched - * back on host, so it is safe to read guest PKRU from current - * XSAVE. - */ - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = rdpkru(); - if (vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vmx->host_pkru); - } - kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; - vmx->exit_reason = vmx->fail ? 
0xdead : vmcs_read32(VM_EXIT_REASON); - if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) + if (unlikely(vmx->fail)) { + vmx->exit_reason = 0xdead; + return EXIT_FASTPATH_NONE; + } + + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); - if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; + trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + + if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + exit_fastpath = vmx_exit_handlers_fastpath(vcpu); + if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) { + if (!kvm_vcpu_exit_request(vcpu)) { + /* + * FIXME: this goto should be a loop in vcpu_enter_guest, + * but it would incur the cost of a retpoline for now. + * Revisit once static calls are available. + */ + if (vcpu->arch.apicv_active) + vmx_sync_pir_to_irr(vcpu); + goto reenter_guest; + } + exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; + } + + return exit_fastpath; } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -7169,6 +7297,9 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); + + if (vmx_waitpkg_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -7284,10 +7415,6 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; - if (kvm_mwait_in_guest(vcpu->kvm) || - kvm_can_post_timer_interrupt(vcpu)) - return -EOPNOTSUPP; - vmx = to_vmx(vcpu); tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); @@ -7630,12 +7757,12 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } -static int vmx_smi_allowed(struct kvm_vcpu *vcpu) +static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - return 1; + return -EBUSY; + return !is_smm(vcpu); } static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -7672,9 +7799,9 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 0; } -static int enable_smi_window(struct kvm_vcpu *vcpu) +static void enable_smi_window(struct kvm_vcpu *vcpu) { - return 0; + /* RSM will cause a vmexit anyway. 
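Taken together, the hunks above change the run-loop contract: vmx_vcpu_run() now returns a fastpath_t, and EXIT_FASTPATH_REENTER_GUEST is consumed inside vmx_vcpu_run() itself via the reenter_guest label. A condensed, illustrative view of what the caller ends up seeing (a sketch, not the kernel's literal code):

	static int sketch_exit_path(struct kvm_vcpu *vcpu)
	{
		fastpath_t exit_fastpath = kvm_x86_ops.run(vcpu);

		/*
		 * By the time .run() returns, REENTER_GUEST has either been
		 * satisfied by looping back to reenter_guest (no exit request
		 * was pending) or been converted to EXIT_FASTPATH_EXIT_HANDLED.
		 * Anything other than EXIT_FASTPATH_NONE makes vmx_handle_exit()
		 * return 1 immediately: the exit was fully serviced with IRQs
		 * off, including any instruction skip, so no regular exit
		 * handler runs.
		 */
		return kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
	}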
*/ } static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) @@ -7687,6 +7814,16 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.vmxon; } +static void vmx_migrate_timers(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; + + if (hrtimer_try_to_cancel(timer) == 1) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } +} + static void hardware_unsetup(void) { if (nested) @@ -7731,8 +7868,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, @@ -7740,16 +7875,16 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .get_dr6 = vmx_get_dr6, - .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .tlb_flush = vmx_flush_tlb, + .tlb_flush_all = vmx_flush_tlb_all, + .tlb_flush_current = vmx_flush_tlb_current, .tlb_flush_gva = vmx_flush_tlb_gva, + .tlb_flush_guest = vmx_flush_tlb_guest, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, @@ -7784,7 +7919,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = get_ept_level, + .get_tdp_level = vmx_get_tdp_level, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, @@ -7793,7 +7928,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .read_l1_tsc_offset = vmx_read_l1_tsc_offset, .write_l1_tsc_offset = vmx_write_l1_tsc_offset, .load_mmu_pgd = vmx_load_mmu_pgd, @@ -7815,6 +7949,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .post_block = vmx_post_block, .pmu_ops = &intel_pmu_ops, + .nested_ops = &vmx_nested_ops, .update_pi_irte = vmx_update_pi_irte, @@ -7830,14 +7965,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .check_nested_events = NULL, - .get_nested_state = NULL, - .set_nested_state = NULL, - .get_vmcs12_pages = NULL, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .migrate_timers = vmx_migrate_timers, }; static __init int hardware_setup(void) @@ -7936,11 +8066,11 @@ static __init int hardware_setup(void) if (!enable_ept) ept_lpage_level = 0; else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PT_PDPE_LEVEL; + ept_lpage_level = PG_LEVEL_1G; else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PT_DIRECTORY_LEVEL; + ept_lpage_level = PG_LEVEL_2M; else - ept_lpage_level = PT_PAGE_TABLE_LEVEL; + ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, ept_lpage_level); /* @@ -8000,8 +8130,7 @@ static __init int hardware_setup(void) nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); - r = nested_vmx_hardware_setup(&vmx_x86_ops, - kvm_vmx_exit_handlers); + r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h 
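vmx_migrate_timers() above leans on the tri-state return value of hrtimer_try_to_cancel(). A minimal sketch of the re-pinning pattern with the return codes spelled out (the wrapper name is made up; the hrtimer calls are the real API):

	static void repin_hrtimer_to_this_cpu(struct hrtimer *timer)
	{
		/*
		 * hrtimer_try_to_cancel() returns:
		 *   1  timer was queued and is now cancelled -> safe to re-arm
		 *   0  timer was not queued at all           -> nothing to move
		 *  -1  callback is running right now          -> leave it alone
		 * Only the "== 1" case re-arms with the previously programmed
		 * expiry, now pinned to the current CPU, which is why the hunk
		 * above checks for exactly 1.
		 */
		if (hrtimer_try_to_cancel(timer) == 1)
			hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}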
index aab9df55336e..672c28f17e49 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -8,6 +8,7 @@ #include <asm/intel_pt.h> #include "capabilities.h" +#include "kvm_cache_regs.h" #include "ops.h" #include "vmcs.h" @@ -136,6 +137,7 @@ struct nested_vmx { bool vmcs02_initialized; bool change_vmcs01_virtual_apic_mode; + bool reload_vmcs01_apic_access_page; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to @@ -167,6 +169,8 @@ struct nested_vmx { u16 posted_intr_nv; struct hrtimer preemption_timer; + u64 preemption_timer_deadline; + bool has_preemption_timer_deadline; bool preemption_timer_expired; /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ @@ -208,6 +212,7 @@ struct vcpu_vmx { */ bool guest_state_loaded; + unsigned long exit_qualification; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -317,8 +322,8 @@ struct kvm_vmx { }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); -void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, + struct loaded_vmcs *buddy); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); @@ -341,6 +346,8 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); +bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); @@ -441,9 +448,18 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) -static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) { - vmx->segment_cache.bitmask = 0; + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) + | (1 << VCPU_EXREG_CR0) + | (1 << VCPU_EXREG_CR3) + | (1 << VCPU_EXREG_CR4) + | (1 << VCPU_EXREG_EXIT_INFO_1) + | (1 << VCPU_EXREG_EXIT_INFO_2)); + vcpu->arch.regs_dirty = 0; } static inline u32 vmx_vmentry_ctrl(void) @@ -486,6 +502,28 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + } + return vmx->exit_qualification; +} + +static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + } + return vmx->exit_intr_info; +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -500,24 +538,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) 
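The vmx_get_exit_qual()/vmx_get_intr_info() accessors above turn two hot VMCS fields into read-once-per-exit values keyed off regs_avail. A small caller-side sketch of the caching behavior (illustrative):

	static void sketch_exit_info_use(struct kvm_vcpu *vcpu)
	{
		/* First use after a VM-exit: one VMREAD, result cached. */
		unsigned long qual = vmx_get_exit_qual(vcpu);

		/* Subsequent uses within the same exit: cached, no VMREAD. */
		qual |= vmx_get_exit_qual(vcpu);

		/*
		 * vmx_register_cache_reset() runs after every VM-entry and
		 * clears the VCPU_EXREG_EXIT_INFO_1/2 bits in regs_avail, so
		 * the first access after the next VM-exit does a fresh VMREAD.
		 */
		(void)qual;
	}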
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, - bool invalidate_gpa) -{ - if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - ept_sync_context(construct_eptp(vcpu, - vcpu->arch.mmu->root_hpa)); - } else { - vpid_sync_context(vpid); - } -} - -static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) -{ - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); -} - static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) { vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3bf2ecafd027..9e41b5135340 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -18,6 +18,7 @@ #include <linux/kvm_host.h> #include "irq.h" +#include "ioapic.h" #include "mmu.h" #include "i8254.h" #include "tss.h" @@ -97,9 +98,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; -#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ -#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ - #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -194,45 +192,46 @@ u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "req_event", VCPU_STAT(req_event) }, - { "l1d_flush", VCPU_STAT(l1d_flush) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages, .mode = 0444) }, - { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, - { "max_mmu_page_hash_collisions", - VM_STAT(max_mmu_page_hash_collisions) }, + VCPU_STAT("pf_fixed", pf_fixed), + VCPU_STAT("pf_guest", pf_guest), + VCPU_STAT("tlb_flush", tlb_flush), + VCPU_STAT("invlpg", invlpg), + VCPU_STAT("exits", exits), + VCPU_STAT("io_exits", 
io_exits), + VCPU_STAT("mmio_exits", mmio_exits), + VCPU_STAT("signal_exits", signal_exits), + VCPU_STAT("irq_window", irq_window_exits), + VCPU_STAT("nmi_window", nmi_window_exits), + VCPU_STAT("halt_exits", halt_exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hypercalls", hypercalls), + VCPU_STAT("request_irq", request_irq_exits), + VCPU_STAT("irq_exits", irq_exits), + VCPU_STAT("host_state_reload", host_state_reload), + VCPU_STAT("fpu_reload", fpu_reload), + VCPU_STAT("insn_emulation", insn_emulation), + VCPU_STAT("insn_emulation_fail", insn_emulation_fail), + VCPU_STAT("irq_injections", irq_injections), + VCPU_STAT("nmi_injections", nmi_injections), + VCPU_STAT("req_event", req_event), + VCPU_STAT("l1d_flush", l1d_flush), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), + VM_STAT("mmu_pte_write", mmu_pte_write), + VM_STAT("mmu_pte_updated", mmu_pte_updated), + VM_STAT("mmu_pde_zapped", mmu_pde_zapped), + VM_STAT("mmu_flooded", mmu_flooded), + VM_STAT("mmu_recycled", mmu_recycled), + VM_STAT("mmu_cache_miss", mmu_cache_miss), + VM_STAT("mmu_unsync", mmu_unsync), + VM_STAT("remote_tlb_flush", remote_tlb_flush), + VM_STAT("largepages", lpages, .mode = 0444), + VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), + VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), { NULL } }; @@ -261,7 +260,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) + for (i = 0; i < ASYNC_PF_PER_VCPU; i++) vcpu->arch.apf.gfns[i] = ~0; } @@ -572,11 +571,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, - unsigned long payload) +void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, + unsigned long payload) { kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); } +EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) @@ -611,15 +611,28 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { - if (mmu_is_nested(vcpu) && !fault->nested_page_fault) - vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); - else - vcpu->arch.mmu->inject_page_fault(vcpu, fault); + struct kvm_mmu *fault_mmu; + WARN_ON_ONCE(fault->vector != PF_VECTOR); + + fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : + vcpu->arch.walk_mmu; + + /* + * Invalidate the TLB entry for the faulting address, if it exists, + * else the access will fault indefinitely (and to emulate hardware). 
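The x86-local VM_STAT/VCPU_STAT macros (offset first, name supplied at the call site) are removed above, and the debugfs_entries[] initializers now pass the name as the first argument. The common replacements are not shown in this diff; presumably they now live in a shared KVM header and look roughly like this, inferred from the name-first call sites and the ".mode = 0444" varargs:

	/* Assumed shape of the new common macros (not part of this diff). */
	#define VM_STAT(n, x, ...)						\
		{ n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
	#define VCPU_STAT(n, x, ...)						\
		{ n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }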
+ */ + if ((fault->error_code & PFERR_PRESENT_MASK) && + !(fault->error_code & PFERR_RSVD_MASK)) + kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, + fault_mmu->root_hpa); + fault_mmu->inject_page_fault(vcpu, fault); return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -836,11 +849,25 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } + + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && + vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + vcpu->arch.pkru = rdpkru(); + if (vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.host_pkru); + } + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) @@ -926,19 +953,6 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); __reserved_bits; \ }) -static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) -{ - u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - - if (kvm_cpu_cap_has(X86_FEATURE_LA57)) - reserved_bits &= ~X86_CR4_LA57; - - if (kvm_cpu_cap_has(X86_FEATURE_UMIP)) - reserved_bits &= ~X86_CR4_UMIP; - - return reserved_bits; -} - static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) @@ -1006,7 +1020,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { if (!skip_tlb_flush) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } return 0; } @@ -1018,7 +1032,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); + kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@ -1058,13 +1072,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) } } -static void kvm_update_dr6(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6); -} - -static void kvm_update_dr7(struct kvm_vcpu *vcpu) +void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -1077,6 +1085,7 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +EXPORT_SYMBOL_GPL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { @@ -1103,7 +1112,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); - kvm_update_dr6(vcpu); break; case 5: /* fall through */ @@ -1139,10 +1147,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) case 4: /* fall through */ case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - *val = vcpu->arch.dr6; - else - *val = kvm_x86_ops.get_dr6(vcpu); + *val = vcpu->arch.dr6; break; case 5: /* fall through */ @@ -1241,13 +1246,18 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 
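The PKRU switching that this series moves out of vmx_vcpu_run() and into kvm_load_guest_xsave_state()/kvm_load_host_xsave_state() above is easiest to follow as one lifecycle. A condensed summary, assembled from these hunks plus the host_pkru save added to kvm_arch_vcpu_load() further down (illustrative):

	/*
	 *  vcpu_load:  vcpu->arch.host_pkru = read_pkru();
	 *  VM-entry:   if (PKU && (CR4.PKE || (xcr0 & XFEATURE_MASK_PKRU))
	 *                  && guest pkru != host pkru)
	 *                      __write_pkru(vcpu->arch.pkru);
	 *  VM-exit:    same guard; vcpu->arch.pkru = rdpkru();
	 *              if it differs, __write_pkru(vcpu->arch.host_pkru);
	 *
	 * Checking XCR0's PKRU bit in addition to CR4.PKE appears to be the
	 * point of the change: guest XRSTOR can load a guest PKRU value
	 * whenever the xfeature is enabled, even with CR4.PKE clear, so
	 * PKRU must be context-switched in that case too.
	 */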
HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1314,6 +1324,7 @@ static const u32 msr_based_features_all[] = { MSR_F10H_DECFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, }; static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; @@ -1572,6 +1583,13 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) +{ + return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || + need_resched() || signal_pending(current); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); + /* * The fast path for frequent and performance sensitive wrmsr emulation, * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces @@ -1600,27 +1618,44 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data return 1; } -enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) +{ + if (!kvm_can_use_hv_timer(vcpu)) + return 1; + + kvm_set_lapic_tscdeadline_msr(vcpu, data); + return 0; +} + +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - int ret = 0; + fastpath_t ret = EXIT_FASTPATH_NONE; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); - ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_EXIT_HANDLED; + } + break; + case MSR_IA32_TSCDEADLINE: + data = kvm_read_edx_eax(vcpu); + if (!handle_fastpath_set_tscdeadline(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_REENTER_GUEST; + } break; default: - return EXIT_FASTPATH_NONE; + break; } - if (!ret) { + if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); - return EXIT_FASTPATH_SKIP_EMUL_INS; - } - return EXIT_FASTPATH_NONE; + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -1909,7 +1944,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); + u64 curr_offset = vcpu->arch.l1_tsc_offset; vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1951,14 +1986,13 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); - - return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + vcpu->arch.l1_tsc_offset = offset; vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); } @@ -2083,7 +2117,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu 
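handle_fastpath_set_msr_irqoff() above now defines a small contract for fastpath WRMSR emulation. A hedged template for what a new case must do; the MSR and helper below are hypothetical, only the contract is taken from the hunk:

	static fastpath_t example_fastpath_wrmsr(struct kvm_vcpu *vcpu)
	{
		u32 msr = kvm_rcx_read(vcpu);
		fastpath_t ret = EXIT_FASTPATH_NONE;

		switch (msr) {
		case MSR_HYPOTHETICAL_EXAMPLE:			/* made up */
			/* Runs with IRQs off: must not sleep or exit to
			 * userspace; the payload is in EDX:EAX. */
			if (!handle_fastpath_hypothetical(vcpu,	/* made up */
							  kvm_read_edx_eax(vcpu))) {
				/* The fastpath must advance RIP itself. */
				kvm_skip_emulated_instruction(vcpu);
				/* Or EXIT_FASTPATH_REENTER_GUEST when
				 * immediate re-entry without the full exit
				 * path is safe, as the TSCDEADLINE case. */
				ret = EXIT_FASTPATH_EXIT_HANDLED;
			}
			break;
		default:
			break;
		}
		return ret;
	}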
*vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); + u64 tsc_offset = vcpu->arch.l1_tsc_offset; kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } @@ -2645,29 +2679,54 @@ out: return r; } +static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +{ + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + return (vcpu->arch.apf.msr_en_val & mask) == mask; +} + static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 3:5 are reserved, Should be zero */ - if (data & 0x38) + /* Bits 4:5 are reserved, Should be zero */ + if (data & 0x30) return 1; - vcpu->arch.apf.msr_val = data; + vcpu->arch.apf.msr_en_val = data; - if (!(data & KVM_ASYNC_PF_ENABLED)) { + if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); return 0; } if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u32))) + sizeof(u64))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + kvm_async_pf_wakeup_all(vcpu); + + return 0; +} + +static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) +{ + /* Bits 8-63 are reserved */ + if (data >> 8) + return 1; + + if (!lapic_in_kernel(vcpu)) + return 1; + + vcpu->arch.apf.msr_int_val = data; + + vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; + return 0; } @@ -2677,10 +2736,16 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.time = 0; } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush_all(vcpu); +} + +static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_guest(vcpu); } static void record_steal_time(struct kvm_vcpu *vcpu) @@ -2706,7 +2771,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb(vcpu, false); + kvm_vcpu_flush_tlb_guest(vcpu); vcpu->arch.st.preempted = 0; @@ -2883,6 +2948,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_ASYNC_PF_INT: + if (kvm_pv_enable_async_pf_int(vcpu, data)) + return 1; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (data & 0x1) { + vcpu->arch.apf.pageready_pending = false; + kvm_check_async_pf_completion(vcpu); + } + break; case MSR_KVM_STEAL_TIME: if (unlikely(!sched_info_on())) @@ -2940,6 +3015,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -3060,6 +3137,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: case MSR_F15H_EX_CFG: + /* + * Intel Sandy Bridge CPUs must support the RAPL (running average power + * limit) MSRs. Just return 0, as we do not want to expose the host + * data here. 
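kvm_pv_async_pf_enabled() above requires two flag bits at once. A worked example of the check, assuming the uapi bit assignments implied by this patch (ENABLED = bit 0, SEND_ALWAYS = bit 1, DELIVERY_AS_PF_VMEXIT = bit 2, DELIVERY_AS_INT = bit 3 -- which would also explain why the reserved-bits test tightened from 0x38, bits 3:5, to 0x30, bits 4:5):

	static bool sketch_async_pf_enabled(u64 msr_en_val)
	{
		const u64 mask = (1ULL << 0) | (1ULL << 3); /* ENABLED | AS_INT */

		/*
		 * GPA | 0x1 -> false: enabled, but legacy #PF-based delivery;
		 *              the new interrupt mechanism stays off.
		 * GPA | 0x9 -> true:  enabled with interrupt-based delivery.
		 * The data area GPA occupies bits 6 and up (64-byte aligned),
		 * so flags and address never collide.
		 */
		return (msr_en_val & mask) == mask;
	}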
Do not conditionalize this on CPUID, as KVM does not do + * so for existing CPU-specific MSRs. + */ + case MSR_RAPL_POWER_UNIT: + case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ + case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ + case MSR_PKG_ENERGY_STATUS: /* Total package */ + case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: @@ -3068,7 +3156,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; case MSR_IA32_UCODE_REV: @@ -3146,7 +3234,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - msr_info->data = vcpu->arch.apf.msr_val; + msr_info->data = vcpu->arch.apf.msr_en_val; + break; + case MSR_KVM_ASYNC_PF_INT: + msr_info->data = vcpu->arch.apf.msr_int_val; + break; + case MSR_KVM_ASYNC_PF_ACK: + msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: msr_info->data = vcpu->arch.st.msr_val; @@ -3184,6 +3278,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -3230,7 +3326,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + return kvm_pmu_get_msr(vcpu, msr_info); if (!ignore_msrs) { vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); @@ -3360,6 +3456,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: @@ -3374,6 +3471,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -3427,14 +3525,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops.get_nested_state ? - kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.nested_ops->get_state ? 
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops.nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; default: break; @@ -3559,6 +3657,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops.vcpu_load(vcpu, cpu); + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); + /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@ -3752,7 +3853,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, unsigned bank_num = mcg_cap & 0xff, bank; r = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; @@ -4010,7 +4111,6 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; kvm_update_dr7(vcpu); @@ -4220,9 +4320,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops.nested_enable_evmcs) + if (!kvm_x86_ops.nested_ops->enable_evmcs) return -ENOTTY; - r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@ -4537,7 +4637,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops.get_nested_state) + if (!kvm_x86_ops.nested_ops->get_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4545,8 +4645,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state, - user_data_size); + r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, + user_data_size); if (r < 0) break; @@ -4567,7 +4667,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int idx; r = -EINVAL; - if (!kvm_x86_ops.set_nested_state) + if (!kvm_x86_ops.nested_ops->set_state) break; r = -EFAULT; @@ -4580,7 +4680,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE - | KVM_STATE_NESTED_EVMCS)) + | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING + | KVM_STATE_NESTED_GIF_SET)) break; /* nested_run_pending implies guest_mode. 
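From this hunk on, the per-vendor nested-virtualization hooks are reached through kvm_x86_ops.nested_ops. The struct definition is not part of this diff; reconstructed from the call sites visible here, it contains at least the following members (an assumed shape, not the full definition):

	struct kvm_x86_nested_ops {
		int (*check_events)(struct kvm_vcpu *vcpu);
		bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
		int (*get_state)(struct kvm_vcpu *vcpu,
				 struct kvm_nested_state __user *user_kvm_nested_state,
				 unsigned int size);
		int (*set_state)(struct kvm_vcpu *vcpu,
				 struct kvm_nested_state __user *user_kvm_nested_state,
				 struct kvm_nested_state *kvm_state);
		bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
		int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version);
	};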
*/ @@ -4589,7 +4690,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -5049,10 +5150,13 @@ set_identity_unlock: r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(u.ps))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit_out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); +set_pit_out: + mutex_unlock(&kvm->lock); break; } case KVM_GET_PIT2: { @@ -5072,10 +5176,13 @@ set_identity_unlock: r = -EFAULT; if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit2_out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); +set_pit2_out: + mutex_unlock(&kvm->lock); break; } case KVM_REINJECT_CONTROL: { @@ -5230,6 +5337,10 @@ static void kvm_init_msr_list(void) if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) continue; break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + continue; + break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) @@ -5247,7 +5358,7 @@ static void kvm_init_msr_list(void) !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) @@ -5262,7 +5373,7 @@ static void kvm_init_msr_list(void) if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; - } + break; default: break; } @@ -6391,7 +6502,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, @@ -6654,7 +6765,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; @@ -6714,9 +6825,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= dr6 | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); *r = 1; return true; } @@ -7659,14 +7768,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); } -static int inject_pending_event(struct kvm_vcpu *vcpu) +static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; + bool can_inject = true; /* try to reinject previous events if any */ - if (vcpu->arch.exception.injected) + if (vcpu->arch.exception.injected) { kvm_x86_ops.queue_exception(vcpu); + can_inject = false; + } /* * Do not inject an NMI or interrupt if there is a pending * exception. 
Exceptions and interrupts are recognized at @@ -7682,22 +7794,28 @@ * fully complete the previous instruction. */ else if (!vcpu->arch.exception.pending) { - if (vcpu->arch.nmi_injected) + if (vcpu->arch.nmi_injected) { kvm_x86_ops.set_nmi(vcpu); - else if (vcpu->arch.interrupt.injected) + can_inject = false; + } else if (vcpu->arch.interrupt.injected) { kvm_x86_ops.set_irq(vcpu); + can_inject = false; + } } + WARN_ON_ONCE(vcpu->arch.exception.injected && + vcpu->arch.exception.pending); + /* * Call check_nested_events() even if we reinjected a previous event * in order for the caller to determine if it should require an immediate * exit when there are pending L1 events which require a VM-exit * from L2 to L1. */ - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { - r = kvm_x86_ops.check_nested_events(vcpu); - if (r != 0) - return r; + if (is_guest_mode(vcpu)) { + r = kvm_x86_ops.nested_ops->check_events(vcpu); + if (r < 0) + goto busy; } /* try to inject new event if pending */ @@ -7706,7 +7824,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); - WARN_ON_ONCE(vcpu->arch.exception.injected); vcpu->arch.exception.pending = false; vcpu->arch.exception.injected = true; @@ -7715,16 +7832,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) X86_EFLAGS_RF); if (vcpu->arch.exception.nr == DB_VECTOR) { - /* - * This code assumes that nSVM doesn't use - * check_nested_events(). If it does, the - * DR6/DR7 changes should happen before L1 - * gets a #VMEXIT for an intercepted #DB in - * L2. (Under VMX, on the other hand, the - * DR6/DR7 changes should not happen in the - * event of a VM-exit to L1 for an intercepted - * #DB in L2.) - */ kvm_deliver_exception_payload(vcpu); if (vcpu->arch.dr7 & DR7_GD) { vcpu->arch.dr7 &= ~DR7_GD; @@ -7733,42 +7840,72 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) } kvm_x86_ops.queue_exception(vcpu); + can_inject = false; } - /* Don't consider new event if we re-injected an event */ - if (kvm_event_needs_reinjection(vcpu)) - return 0; + /* + * Finally, inject interrupt events. If an event cannot be injected + * due to architectural conditions (e.g. IF=0) a window-open exit + * will re-request KVM_REQ_EVENT. Sometimes, however, an event is pending + * and can architecturally be injected, but we cannot do it right now: + * an interrupt could have arrived just now and we have to inject it + * as a vmexit, or there could already be an event in the queue, which is + * indicated by can_inject. In that case we request an immediate exit + * in order to make progress and get back here for another iteration. + * The kvm_x86_ops hooks communicate this by returning -EBUSY. + */ + if (vcpu->arch.smi_pending) { + r = can_inject ? 
kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + vcpu->arch.smi_pending = false; + ++vcpu->arch.smi_count; + enter_smm(vcpu); + can_inject = false; + } else + kvm_x86_ops.enable_smi_window(vcpu); + } - if (vcpu->arch.smi_pending && !is_smm(vcpu) && - kvm_x86_ops.smi_allowed(vcpu)) { - vcpu->arch.smi_pending = false; - ++vcpu->arch.smi_count; - enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops.set_nmi(vcpu); - } else if (kvm_cpu_has_injectable_intr(vcpu)) { - /* - * Because interrupts can be injected asynchronously, we are - * calling check_nested_events again here to avoid a race condition. - * See https://lkml.org/lkml/2014/7/2/60 for discussion about this - * proposal and current concerns. Perhaps we should be setting - * KVM_REQ_EVENT only on certain events and not unconditionally? - */ - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { - r = kvm_x86_ops.check_nested_events(vcpu); - if (r != 0) - return r; + if (vcpu->arch.nmi_pending) { + r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + --vcpu->arch.nmi_pending; + vcpu->arch.nmi_injected = true; + kvm_x86_ops.set_nmi(vcpu); + can_inject = false; + WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0); } - if (kvm_x86_ops.interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - false); + if (vcpu->arch.nmi_pending) + kvm_x86_ops.enable_nmi_window(vcpu); + } + + if (kvm_cpu_has_injectable_intr(vcpu)) { + r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); kvm_x86_ops.set_irq(vcpu); + WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0); } + if (kvm_cpu_has_injectable_intr(vcpu)) + kvm_x86_ops.enable_irq_window(vcpu); } - return 0; + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + *req_immediate_exit = true; + + WARN_ON(vcpu->arch.exception.pending); + return; + +busy: + *req_immediate_exit = true; + return; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -8037,7 +8174,7 @@ void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, zalloc_cpumask_var(&cpus, GFP_ATOMIC); kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - vcpu_bitmap, cpus); + NULL, vcpu_bitmap, cpus); free_cpumask_var(cpus); } @@ -8067,6 +8204,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { + struct kvm_vcpu *except; unsigned long old, new, expected; if (!kvm_x86_ops.check_apicv_inhibit_reasons || @@ -8091,7 +8229,17 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) trace_kvm_apicv_update_request(activate, bit); if (kvm_x86_ops.pre_update_apicv_exec_ctrl) kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate); - kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + + /* + * Send a request to update APICv for all other vcpus, + * and update the calling vcpu immediately instead of + * waiting for another #VMEXIT to handle the request. 
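Each event class above (SMI, NMI, maskable IRQ) now follows one shared shape. A condensed sketch of that pattern; event_allowed/inject_event/event_still_pending/enable_window are stand-ins for the per-class fields and hooks (smi_allowed/enter_smm/enable_smi_window and their NMI/IRQ counterparts), not real functions:

	static void sketch_try_inject(struct kvm_vcpu *vcpu, bool *can_inject,
				      bool *req_immediate_exit)
	{
		int r = *can_inject ? event_allowed(vcpu, true) : -EBUSY;

		if (r < 0) {			/* -EBUSY: architecturally
						 * injectable, but not right
						 * now */
			*req_immediate_exit = true;	/* i.e. "goto busy" */
			return;
		}
		if (r) {			/* > 0: inject immediately */
			inject_event(vcpu);
			*can_inject = false;	/* one new injection per entry */
		}
		if (event_still_pending(vcpu))	/* r == 0, or queue not drained */
			enable_window(vcpu);	/* vmexit once injection is OK */
	}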
+ */ + except = kvm_get_running_vcpu(); + kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, + except); + if (except) + kvm_vcpu_update_apicv(except); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -8148,24 +8296,13 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { - struct page *page = NULL; - if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops.set_apic_access_page_addr) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) - return; - kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page)); - - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - put_page(page); + kvm_x86_ops.set_apic_access_page_addr(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -8185,13 +8322,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); - enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; + fastpath_t exit_fastpath; bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) { + if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { r = 0; goto out; } @@ -8213,8 +8350,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu, true); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + kvm_vcpu_flush_tlb_all(vcpu); + + /* Flushing all ASIDs flushes the current ASID... */ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@ -8287,6 +8433,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) kvm_vcpu_update_apicv(vcpu); + if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) + kvm_check_async_pf_completion(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -8297,32 +8445,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - if (inject_pending_event(vcpu) != 0) - req_immediate_exit = true; - else { - /* Enable SMI/NMI/IRQ window open exits if needed. - * - * SMIs have three cases: - * 1) They can be nested, and then there is nothing to - * do here because RSM will cause a vmexit anyway. - * 2) There is an ISA-specific reason why SMI cannot be - * injected, and the moment when this changes can be - * intercepted. - * 3) Or the SMI can be pending because - * inject_pending_event has completed the injection - * of an IRQ or NMI from the previous vmexit, and - * then we request an immediate exit to inject the - * SMI. 
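The request processing above now distinguishes three flush scopes. A summary of the mapping, with semantics as implied by this patch (the EPT/VPID specifics live in the vendor hooks registered earlier):

	/*
	 *  KVM_REQ_TLB_FLUSH         -> .tlb_flush_all:     every context the
	 *                               vCPU owns (all ASIDs/EPT contexts).
	 *  KVM_REQ_TLB_FLUSH_CURRENT -> .tlb_flush_current: only the context
	 *                               currently loaded, e.g. after INVPCID
	 *                               of the active PCID or the APIC access
	 *                               page re-enable above.
	 *  KVM_REQ_HV_TLB_FLUSH      -> .tlb_flush_guest:   a guest-visible
	 *                               flush, for PV/Hyper-V flush requests.
	 *
	 * Hence the kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT) after a full
	 * flush: flushing all contexts subsumes the current-context flush, so
	 * the narrower request can be dropped.
	 */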
- */ - if (vcpu->arch.smi_pending && !is_smm(vcpu)) - if (!kvm_x86_ops.enable_smi_window(vcpu)) - req_immediate_exit = true; - if (vcpu->arch.nmi_pending) - kvm_x86_ops.enable_nmi_window(vcpu); - if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops.enable_irq_window(vcpu); - WARN_ON(vcpu->arch.exception.pending); - } + inject_pending_event(vcpu, &req_immediate_exit); + if (req_int_win) + kvm_x86_ops.enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -8370,8 +8495,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) kvm_x86_ops.sync_pir_to_irr(vcpu); - if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) - || need_resched() || signal_pending(current)) { + if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -8403,7 +8527,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - kvm_x86_ops.run(vcpu); + exit_fastpath = kvm_x86_ops.run(vcpu); /* * Do this here before restoring debug registers on the host. And @@ -8415,7 +8539,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops.sync_dirty_debug_regs(vcpu); kvm_update_dr0123(vcpu); - kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } @@ -8435,7 +8558,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath); + kvm_x86_ops.handle_exit_irqoff(vcpu); /* * Consume any pending interrupts, including the possible source of @@ -8482,6 +8605,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) return r; cancel_injection: + if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_x86_ops.cancel_injection(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); @@ -8524,8 +8649,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) - kvm_x86_ops.check_nested_events(vcpu); + if (is_guest_mode(vcpu)) + kvm_x86_ops.nested_ops->check_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@ -8561,8 +8686,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; } - kvm_check_async_pf_completion(vcpu); - if (signal_pending(current)) { r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; @@ -8707,8 +8830,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) trace_kvm_fpu(0); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int r; vcpu_load(vcpu); @@ -8726,18 +8850,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + kvm_run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } goto out; } - if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { r = -EINVAL; goto out; } - if (vcpu->run->kvm_dirty_regs) { + if (kvm_run->kvm_dirty_regs) { r = sync_regs(vcpu); if (r != 0) goto out; @@ -8767,7 +8891,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: 
kvm_put_guest_fpu(vcpu); - if (vcpu->run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); @@ -9359,9 +9483,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) } fx_init(vcpu); - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -9476,14 +9599,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = DR6_INIT; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); vcpu->arch.cr2 = 0; kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; + vcpu->arch.apf.msr_en_val = 0; + vcpu->arch.apf.msr_int_val = 0; vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); @@ -9658,7 +9781,9 @@ int kvm_arch_hardware_setup(void *opaque) if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; - cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has if (kvm_has_tsc_control) { /* @@ -9690,7 +9815,8 @@ int kvm_arch_check_processor_compat(void *opaque) WARN_ON(!irqs_disabled()); - if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; return ops->check_processor_compatibility(); @@ -10018,7 +10144,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, { /* Still write protect RO slot */ if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL); + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); return; } @@ -10058,7 +10184,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } else { int level = kvm_dirty_log_manual_protect_and_init_set(kvm) ? 
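The #define/#undef pair above is an adapter-macro trick: __cr4_reserved_bits() (whose tail is visible earlier in this diff) expects a two-argument, cpu_has(c, X86_FEATURE_x)-style predicate, while kvm_cpu_cap_has() takes only the feature. An abridged illustration; the two feature checks shown are examples, not the macro's full list:

	#define __cr4_reserved_bits(__cap_has, __c)		\
	({							\
		u64 __reserved_bits = CR4_RESERVED_BITS;	\
		if (!__cap_has(__c, X86_FEATURE_XSAVE))		\
			__reserved_bits |= X86_CR4_OSXSAVE;	\
		if (!__cap_has(__c, X86_FEATURE_PKU))		\
			__reserved_bits |= X86_CR4_PKE;		\
		__reserved_bits;				\
	})

	/* host capabilities: cpu_has(c, feature)      -- two arguments */
	/* KVM capabilities:  kvm_cpu_cap_has(feature) -- one argument  */
	#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)

	/*
	 * __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) expands each
	 * check to kvm_cpu_cap_has(X86_FEATURE_*), discarding the dummy
	 * first argument -- no second copy of the macro body is needed.
	 */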
- PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL; + PG_LEVEL_2M : PG_LEVEL_4K; /* * If we're with initial-all-set, we don't need @@ -10160,11 +10286,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops.nmi_allowed(vcpu))) + kvm_x86_ops.nmi_allowed(vcpu, false))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && !is_smm(vcpu))) + (vcpu->arch.smi_pending && + kvm_x86_ops.smi_allowed(vcpu, false))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -10175,6 +10302,11 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_hv_has_stimer_pending(vcpu)) return true; + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + return true; + return false; } @@ -10211,7 +10343,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.interrupt_allowed(vcpu); + return kvm_x86_ops.interrupt_allowed(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -10276,12 +10408,14 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { + BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); } static inline u32 kvm_async_pf_next_probe(u32 key) { - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); + return (key + 1) & (ASYNC_PF_PER_VCPU - 1); } static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -10299,7 +10433,7 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) int i; u32 key = kvm_async_pf_hash_fn(gfn); - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + for (i = 0; i < ASYNC_PF_PER_VCPU && (vcpu->arch.apf.gfns[key] != gfn && vcpu->arch.apf.gfns[key] != ~0); i++) key = kvm_async_pf_next_probe(key); @@ -10317,6 +10451,10 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) u32 i, j, k; i = j = kvm_async_pf_gfn_slot(vcpu, gfn); + + if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn)) + return; + while (true) { vcpu->arch.apf.gfns[i] = ~0; do { @@ -10335,18 +10473,32 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) } } -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu) { + u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT; - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason, + sizeof(reason)); } -static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token) { + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, - sizeof(u32)); + return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &token, offset, sizeof(token)); +} + +static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu) +{ + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); + u32 val; + + if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &val, offset, sizeof(val))) + return false; + + return !val; } static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) @@ -10354,9 +10506,8 @@ static bool kvm_can_deliver_async_pf(struct 
kvm_vcpu *vcpu) if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) return false; - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops.get_cpl(vcpu) == 0)) + if (!kvm_pv_async_pf_enabled(vcpu) || + (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0)) return false; return true; @@ -10376,7 +10527,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) * If interrupts are off we cannot even use an artificial * halt state. */ - return kvm_x86_ops.interrupt_allowed(vcpu); + return kvm_arch_interrupt_allowed(vcpu); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, @@ -10388,7 +10539,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (kvm_can_deliver_async_pf(vcpu) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + !apf_put_user_notpresent(vcpu)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -10412,8 +10563,10 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { - struct x86_exception fault; - u32 val; + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vcpu->arch.apf.vec + }; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -10421,39 +10574,29 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); - if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && - !apf_get_user(vcpu, &val)) { - if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && - vcpu->arch.exception.pending && - vcpu->arch.exception.nr == PF_VECTOR && - !apf_put_user(vcpu, 0)) { - vcpu->arch.exception.injected = false; - vcpu->arch.exception.pending = false; - vcpu->arch.exception.nr = 0; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.error_code = 0; - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; - } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); - } + if (kvm_pv_async_pf_enabled(vcpu) && + !apf_put_user_ready(vcpu, work->arch.token)) { + vcpu->arch.apf.pageready_pending = true; + kvm_apic_set_irq(vcpu, &irq, NULL); } + vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_APF_READY, vcpu); + if (!vcpu->arch.apf.pageready_pending) + kvm_vcpu_kick(vcpu); +} + +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + if (!kvm_pv_async_pf_enabled(vcpu)) return true; else - return kvm_can_do_async_pf(vcpu); + return apf_pageready_slot_free(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b968acc0516f..6eb62e97e59f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -125,6 +125,12 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + 
++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_current(vcpu); +} + static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); @@ -268,7 +274,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, bool kvm_vector_hashing_enabled(void); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); -enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 supported_xcr0; @@ -358,5 +364,6 @@ static inline bool kvm_dr7_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); +bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4742e8fa7ee7..d1d768912368 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -153,7 +153,7 @@ SYM_FUNC_START(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -436,7 +436,7 @@ SYM_FUNC_START(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index c66c8b00f236..ee63d7576fd2 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -10,7 +10,7 @@ #include <asm/smap.h> /** - * csum_partial_copy_from_user - Copy and checksum from user space. + * csum_and_copy_from_user - Copy and checksum from user space. * @src: source address (user space) * @dst: destination address * @len: number of bytes to be copied. @@ -21,13 +21,13 @@ * src and dst are best aligned to 64bits. */ __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, +csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp) { might_sleep(); *errp = 0; - if (!likely(access_ok(src, len))) + if (!user_access_begin(src, len)) goto out_err; /* @@ -42,8 +42,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, while (((unsigned long)src & 6) && len >= 2) { __u16 val16; - if (__get_user(val16, (const __u16 __user *)src)) - goto out_err; + unsafe_get_user(val16, (const __u16 __user *)src, out); *(__u16 *)dst = val16; isum = (__force __wsum)add32_with_carry( @@ -53,25 +52,26 @@ csum_partial_copy_from_user(const void __user *src, void *dst, len -= 2; } } - stac(); isum = csum_partial_copy_generic((__force const void *)src, dst, len, isum, errp, NULL); - clac(); + user_access_end(); if (unlikely(*errp)) goto out_err; return isum; +out: + user_access_end(); out_err: *errp = -EFAULT; memset(dst, 0, len); return isum; } -EXPORT_SYMBOL(csum_partial_copy_from_user); +EXPORT_SYMBOL(csum_and_copy_from_user); /** - * csum_partial_copy_to_user - Copy and checksum to user space. + * csum_and_copy_to_user - Copy and checksum to user space. * @src: source address * @dst: destination address (user space) * @len: number of bytes to be copied. @@ -82,14 +82,14 @@ EXPORT_SYMBOL(csum_partial_copy_from_user); * src and dst are best aligned to 64bits. 
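The csum_and_copy_from_user() conversion above swaps the access_ok()/__get_user() pairs for the scoped user-access API: user_access_begin() validates the range and arms user access (STAC), unsafe_get_user()/unsafe_put_user() branch to a local label on a fault, and user_access_end() disarms access (CLAC) on every exit path. A minimal sketch of the idiom, with a hypothetical helper name:

	/* needs <linux/uaccess.h>; helper name is illustrative only */
	static int copy_one_u16(u16 *dst, const u16 __user *src)
	{
		u16 val;

		if (!user_access_begin(src, sizeof(*src)))
			return -EFAULT;
		unsafe_get_user(val, src, fault);	/* jumps to fault: on #PF */
		user_access_end();
		*dst = val;
		return 0;

	fault:
		user_access_end();
		return -EFAULT;
	}
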
*/ __wsum -csum_partial_copy_to_user(const void *src, void __user *dst, +csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { __wsum ret; might_sleep(); - if (unlikely(!access_ok(dst, len))) { + if (!user_access_begin(dst, len)) { *errp = -EFAULT; return 0; } @@ -100,9 +100,7 @@ csum_partial_copy_to_user(const void *src, void __user *dst, isum = (__force __wsum)add32_with_carry( (__force unsigned)isum, val16); - *errp = __put_user(val16, (__u16 __user *)dst); - if (*errp) - return isum; + unsafe_put_user(val16, (__u16 __user *)dst, out); src += 2; dst += 2; len -= 2; @@ -110,13 +108,16 @@ csum_partial_copy_to_user(const void *src, void __user *dst, } *errp = 0; - stac(); ret = csum_partial_copy_generic(src, (void __force *)dst, len, isum, NULL, errp); - clac(); + user_access_end(); return ret; +out: + user_access_end(); + *errp = -EFAULT; + return isum; } -EXPORT_SYMBOL(csum_partial_copy_to_user); +EXPORT_SYMBOL(csum_and_copy_to_user); /** * csum_partial_copy_nocheck - Copy and checksum. diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index c126571e5e2e..65d15df6212d 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -27,9 +27,20 @@ # include <asm/smp.h> #endif +static void delay_loop(u64 __loops); + +/* + * Calibration and selection of the delay mechanism happens only once + * during boot. + */ +static void (*delay_fn)(u64) __ro_after_init = delay_loop; +static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init; + /* simple loop based delay: */ -static void delay_loop(unsigned long loops) +static void delay_loop(u64 __loops) { + unsigned long loops = (unsigned long)__loops; + asm volatile( " test %0,%0 \n" " jz 3f \n" @@ -49,9 +60,9 @@ static void delay_loop(unsigned long loops) } /* TSC based delay: */ -static void delay_tsc(unsigned long __loops) +static void delay_tsc(u64 cycles) { - u64 bclock, now, loops = __loops; + u64 bclock, now; int cpu; preempt_disable(); @@ -59,7 +70,7 @@ static void delay_tsc(unsigned long __loops) bclock = rdtsc_ordered(); for (;;) { now = rdtsc_ordered(); - if ((now - bclock) >= loops) + if ((now - bclock) >= cycles) break; /* Allow RT tasks to run */ @@ -77,7 +88,7 @@ static void delay_tsc(unsigned long __loops) * counter for this CPU. */ if (unlikely(cpu != smp_processor_id())) { - loops -= (now - bclock); + cycles -= (now - bclock); cpu = smp_processor_id(); bclock = rdtsc_ordered(); } @@ -86,65 +97,96 @@ static void delay_tsc(unsigned long __loops) } /* + * On Intel the TPAUSE instruction waits until any of: + * 1) the TSC counter exceeds the value provided in EDX:EAX + * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded + * 3) an external interrupt occurs + */ +static void delay_halt_tpause(u64 start, u64 cycles) +{ + u64 until = start + cycles; + u32 eax, edx; + + eax = lower_32_bits(until); + edx = upper_32_bits(until); + + /* + * Hard code the deeper (C0.2) sleep state because exit latency is + * small compared to the "microseconds" that usleep() will delay. + */ + __tpause(TPAUSE_C02_STATE, edx, eax); +} + +/* * On some AMD platforms, MWAITX has a configurable 32-bit timer, that - * counts with TSC frequency. The input value is the loop of the - * counter, it will exit when the timer expires. + * counts with TSC frequency. The input value is the number of TSC cycles + * to wait. MWAITX will also exit when the timer expires. 
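Taken together, the delay rework below splits delay() into a generic retry loop (delay_halt()) and a vendor wait primitive selected once at boot through the delay_halt_fn pointer. A sketch of the expected boot-time wiring; the feature checks and call sites are assumptions, not part of this hunk:

	/* assumed call site, e.g. early TSC/CPU setup */
	if (boot_cpu_has(X86_FEATURE_WAITPKG))
		use_tpause_delay();	/* Intel TPAUSE backend */
	else
		use_tsc_delay();	/* plain TSC busy-wait */
	/* AMD CPU setup would similarly call use_mwaitx_delay() when MWAITX is present */
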
*/ -static void delay_mwaitx(unsigned long __loops) +static void delay_halt_mwaitx(u64 unused, u64 cycles) { - u64 start, end, delay, loops = __loops; + u64 delay; + + delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); + /* + * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu + * variable as the monitor target. + */ + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf means: do not + * enter any deep C-state and we use it here in delay() to minimize + * wakeup latency. + */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); +} + +/* + * Call a vendor-specific function to delay for a given amount of time. Because + * these functions may return earlier than requested, check for actual elapsed + * time and call again until done. + */ +static void delay_halt(u64 __cycles) +{ + u64 start, end, cycles = __cycles; /* * Timer value of 0 causes MWAITX to wait indefinitely, unless there * is a store on the memory monitored by MONITORX. */ - if (loops == 0) + if (!cycles) return; start = rdtsc_ordered(); for (;;) { - delay = min_t(u64, MWAITX_MAX_LOOPS, loops); - - /* - * Use cpu_tss_rw as a cacheline-aligned, seldomly - * accessed per-cpu variable as the monitor target. - */ - __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); - - /* - * AMD, like Intel's MWAIT version, supports the EAX hint and - * EAX=0xf0 means, do not enter any deep C-state and we use it - * here in delay() to minimize wakeup latency. - */ - __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); - + delay_halt_fn(start, cycles); end = rdtsc_ordered(); - if (loops <= end - start) + if (cycles <= end - start) break; - loops -= end - start; - + cycles -= end - start; start = end; } } -/* - * Since we calibrate only once at boot, this - * function should be set once at boot and not changed - */ -static void (*delay_fn)(unsigned long) = delay_loop; - -void use_tsc_delay(void) +void __init use_tsc_delay(void) { if (delay_fn == delay_loop) delay_fn = delay_tsc; } +void __init use_tpause_delay(void) +{ + delay_halt_fn = delay_halt_tpause; + delay_fn = delay_halt; +} + void use_mwaitx_delay(void) { - delay_fn = delay_mwaitx; + delay_halt_fn = delay_halt_mwaitx; + delay_fn = delay_halt; } int read_current_timer(unsigned long *timer_val) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 363ec132df7e..b4c43a9b1483 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,15 +7,31 @@ #include <asm/alternative-asm.h> #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> +#include <asm/frame.h> .macro THUNK reg .section .text.__x86.indirect_thunk + .align 32 SYM_FUNC_START(__x86_indirect_thunk_\reg) - CFI_STARTPROC - JMP_NOSPEC %\reg - CFI_ENDPROC + JMP_NOSPEC \reg SYM_FUNC_END(__x86_indirect_thunk_\reg) + +SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg) + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop_\@ .Lspec_trap_\@: + UNWIND_HINT_EMPTY + pause + lfence + jmp .Lspec_trap_\@ .Ldo_rop_\@: + mov %\reg, (%_ASM_SP) + UNWIND_HINT_RET_OFFSET + ret SYM_FUNC_END(__x86_retpoline_\reg) + .endm /* @@ -24,25 +40,24 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) * only see one instance of "__x86_indirect_thunk_\reg" rather * than one per register with the correct names. So we do it * the simple and nasty way... + * + * Worse, you can only have a single EXPORT_SYMBOL per line, + * and CPP can't insert newlines, so we have to repeat everything + * at least twice.
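The repetition described in that comment is handled with an x-macro header: each #include of asm/GEN-for-each-reg.h expands the caller-defined GEN() once per general-purpose register, and the macro body can change between inclusions. The header itself is not part of this diff; presumably it is shaped along these lines (contents assumed, deliberately no include guard):

/* asm/GEN-for-each-reg.h, sketch */
#ifdef CONFIG_64BIT
GEN(rax)
GEN(rbx)
GEN(rcx)
GEN(rdx)
GEN(rsi)
GEN(rdi)
GEN(rbp)
GEN(r8)
GEN(r9)
GEN(r10)
GEN(r11)
GEN(r12)
GEN(r13)
GEN(r14)
GEN(r15)
#else /* 32-bit */
GEN(eax)
GEN(ebx)
GEN(ecx)
GEN(edx)
GEN(esi)
GEN(edi)
GEN(ebp)
#endif
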
*/ -#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) - -GENERATE_THUNK(_ASM_AX) -GENERATE_THUNK(_ASM_BX) -GENERATE_THUNK(_ASM_CX) -GENERATE_THUNK(_ASM_DX) -GENERATE_THUNK(_ASM_SI) -GENERATE_THUNK(_ASM_DI) -GENERATE_THUNK(_ASM_BP) -#ifdef CONFIG_64BIT -GENERATE_THUNK(r8) -GENERATE_THUNK(r9) -GENERATE_THUNK(r10) -GENERATE_THUNK(r11) -GENERATE_THUNK(r12) -GENERATE_THUNK(r13) -GENERATE_THUNK(r14) -GENERATE_THUNK(r15) -#endif + +#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) +#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg) + +#undef GEN +#define GEN(reg) THUNK reg +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) EXPORT_THUNK(reg) +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) EXPORT_RETPOLINE(reg) +#include <asm/GEN-for-each-reg.h> diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 56f9189bbadb..770b613790b3 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -4,9 +4,9 @@ #include <linux/percpu.h> #include <linux/kallsyms.h> #include <linux/kcore.h> +#include <linux/pgtable.h> #include <asm/cpu_entry_area.h> -#include <asm/pgtable.h> #include <asm/fixmap.h> #include <asm/desc.h> @@ -17,7 +17,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif -#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +#ifdef CONFIG_X86_32 DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); #endif @@ -107,19 +107,16 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) */ cea_map_stack(DF); cea_map_stack(NMI); - cea_map_stack(DB1); cea_map_stack(DB); cea_map_stack(MCE); } #else static inline void percpu_setup_exception_stacks(unsigned int cpu) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(cpu); cea_map_percpu_pages(&cea->doublefault_stack, &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); -#endif } #endif diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 4a3b62f780b4..092ea436c7e6 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -3,7 +3,7 @@ #include <linux/efi.h> #include <linux/module.h> #include <linux/seq_file.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> static int ptdump_show(struct seq_file *m, void *v) { diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 69309cd56fdf..e1b599ecbbc2 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -19,7 +19,6 @@ #include <linux/ptdump.h> #include <asm/e820/types.h> -#include <asm/pgtable.h> /* * The dumper groups pagetable entries of the same type into one, and for @@ -249,10 +248,22 @@ static void note_wx(struct pg_state *st, unsigned long addr) (void *)st->start_address); } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) { - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t prot = val & PTE_FLAGS_MASK; + pgprotval_t effective; + + if (level > 0) { + pgprotval_t higher_prot = 
st->prot_levels[level - 1]; + + effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | + ((higher_prot | prot) & _PAGE_NX); + } else { + effective = prot; + } + + st->prot_levels[level] = effective; } /* @@ -261,7 +272,7 @@ static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) * print what we collected so far. */ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); pgprotval_t new_prot, new_eff; @@ -270,16 +281,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, struct seq_file *m = st->seq; new_prot = val & PTE_FLAGS_MASK; - - if (level > 0) { - new_eff = effective_prot(st->prot_levels[level - 1], - new_prot); - } else { - new_eff = new_prot; - } - - if (level >= 0) - st->prot_levels[level] = new_eff; + if (!val) + new_eff = 0; + else + new_eff = st->prot_levels[level]; /* * If we have a "break" in the series, we need to flush the state that @@ -374,6 +379,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { .note_page = note_page, + .effective_prot = effective_prot, .range = ptdump_ranges }, .level = -1, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a51df516b87b..66be9bd60307 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,6 +30,7 @@ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ +#include <asm/kvm_para.h> /* kvm_handle_async_pf */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -190,16 +191,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -static void vmalloc_sync(void) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; + unsigned long addr; - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < VMALLOC_END; - address += PMD_SIZE) { + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); @@ -210,61 +208,13 @@ static void vmalloc_sync(void) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } -void vmalloc_sync_mappings(void) -{ - vmalloc_sync(); -} - -void vmalloc_sync_unmappings(void) -{ - vmalloc_sync(); -} - -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3_pa(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - if (pmd_large(*pmd_k)) - return 0; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - /* * Did it hit the DOS screen memory VA from vm86 mode? 
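Returning to the ptdump effective_prot() change above: the effective protection of an entry ANDs the permission-granting bits (_PAGE_USER and _PAGE_RW must be granted by every level) and ORs _PAGE_NX (which any level may impose). A worked example with illustrative values:

	pgprotval_t parent = _PAGE_USER | _PAGE_RW;	/* upper level: user + write */
	pgprotval_t leaf   = _PAGE_USER | _PAGE_NX;	/* PTE: user, no-exec */

	pgprotval_t eff = (parent & leaf & (_PAGE_USER | _PAGE_RW)) |
			  ((parent | leaf) & _PAGE_NX);
	/* eff == _PAGE_USER | _PAGE_NX: user-readable, not writable, not executable */

Caching the result in st->prot_levels[] means each level only ever needs to consult its immediate parent.
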
*/ @@ -329,96 +279,6 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_mappings(void) -{ - /* - * 64-bit mappings might allocate new p4d/pud pages - * that need to be propagated to all tasks' PGDs. - */ - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); -} - -void vmalloc_sync_unmappings(void) -{ - /* - * Unmappings never allocate or free p4d/pud pages. - * No work is required here. - */ -} - -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_k = pgd_offset_k(address); - if (pgd_none(*pgd_k)) - return -1; - - if (pgtable_l5_enabled()) { - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); - } - } - - /* With 4-level paging, copying happens on the p4d level. */ - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (p4d_none(*p4d_k)) - return -1; - - if (p4d_none(*p4d) && !pgtable_l5_enabled()) { - set_p4d(p4d, *p4d_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); - } - - BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); - - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return -1; - - if (pud_large(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return -1; - - if (pmd_large(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -554,21 +414,13 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) return 0; } +/* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG - unsigned long nr; - - /* - * Pentium F0 0F C7 C8 bug workaround: - */ - if (boot_cpu_has_bug(X86_BUG_F00F)) { - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return 1; - } + if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { + handle_invalid_op(regs); + return 1; } #endif return 0; @@ -926,6 +778,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, force_sig_fault(SIGSEGV, si_code, (void __user *)address); + local_irq_disable(); + return; } @@ -951,7 +805,7 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } @@ -1005,7 +859,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, * 2. T1 : set PKRU to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); - * 5. T1 : enters fault handler, takes mmap_sem, etc... + * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. 
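Stepping back: both the 32-bit and the 64-bit vmalloc_fault() implementations can be deleted in this series because kernel mapping propagation becomes eager rather than fault-driven. The vmalloc core is expected to invoke the new arch_sync_kernel_mappings() hook whenever it populates top-level entries; roughly, under that assumption:

	/* mm/vmalloc.c side, simplified sketch: after populating new
	 * PGD/P4D entries in init_mm for [start, end), let the arch
	 * copy them into all process page tables */
	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
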
*/ @@ -1257,29 +1111,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); - /* - * We can fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * Before doing this on-demand faulting, ensure that the - * fault is not any of the following: - * 1. A fault on a PTE with a reserved bit set. - * 2. A fault caused by a user-mode access. (Do not demand- - * fault kernel memory due to user-mode accesses). - * 3. A fault caused by a page-level protection violation. - * (A demand fault would be on a non-present page which - * would have X86_PF_PROT==0). - */ - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; @@ -1394,15 +1225,15 @@ void do_user_addr_fault(struct pt_regs *regs, * Kernel-mode access to the user address space should only occur * on well-defined single instructions listed in the exception * tables. But, an erroneous kernel fault occurring outside one of - * those areas which also holds mmap_sem might deadlock attempting + * those areas which also holds mmap_lock might deadlock attempting * to validate the fault against the address space. * * Only do the expensive exception table search when we might be at * risk of a deadlock. This happens if we - * 1. Failed to acquire mmap_sem, and + * 1. Failed to acquire mmap_lock, and * 2. The access did not originate in userspace. */ - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!mmap_read_trylock(mm))) { if (!user_mode(regs) && !search_exception_tables(regs->ip)) { /* * Fault from code in kernel from @@ -1412,7 +1243,7 @@ void do_user_addr_fault(struct pt_regs *regs, return; } retry: - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } else { /* * The above down_read_trylock() might have succeeded in @@ -1452,9 +1283,9 @@ good_area: * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if - * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. + * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * - * Note that handle_userfault() may also release and reacquire mmap_sem + * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to @@ -1473,7 +1304,7 @@ good_area: } /* - * If we need to retry the mmap_sem has already been released, + * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. 
*/ @@ -1483,7 +1314,7 @@ good_area: goto retry; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, hw_error_code, address, fault); return; } @@ -1518,20 +1349,74 @@ trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, trace_page_fault_kernel(address, regs, error_code); } -dotraplinkage void -do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, - unsigned long address) +static __always_inline void +handle_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - prefetchw(&current->mm->mmap_sem); - trace_page_fault_entries(regs, hw_error_code, address); + trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address space? */ - if (unlikely(fault_in_kernel_space(address))) - do_kern_addr_fault(regs, hw_error_code, address); - else - do_user_addr_fault(regs, hw_error_code, address); + if (unlikely(fault_in_kernel_space(address))) { + do_kern_addr_fault(regs, error_code, address); + } else { + do_user_addr_fault(regs, error_code, address); + /* + * User address page fault handling might have reenabled + * interrupts. Fixing up all potential exit points of + * do_user_addr_fault() and its leaf functions is just not + * doable w/o creating an unholy mess or turning the code + * upside down. + */ + local_irq_disable(); + } +} + +DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) +{ + unsigned long address = read_cr2(); + bool rcu_exit; + + prefetchw(&current->mm->mmap_lock); + + /* + * KVM has two types of events that are, logically, interrupts, but + * are unfortunately delivered using the #PF vector. These events are + * "you just accessed valid memory, but the host doesn't have it right + * now, so I'll put you to sleep if you continue" and "that memory + * you tried to access earlier is available now." + * + * We are relying on the interrupted context being sane (valid RSP, + * relevant locks not held, etc.), which is fine as long as the + * interrupted context had IF=1. We are also relying on the KVM + * async pf type field and CR2 being read consistently instead of + * getting values from real and async page faults mixed up. + * + * Fingers crossed. + * + * The async #PF handling code takes care of idtentry handling + * itself. + */ + if (kvm_handle_async_pf(regs, (u32)address)) + return; + + /* + * Entry handling for valid #PF from kernel mode is slightly + * different: RCU is already watching and rcu_irq_enter() must not + * be invoked because a kernel fault on a user space address might + * sleep. + * + * In case the fault hit an RCU idle region the conditional entry + * code reenabled RCU to avoid subsequent wreckage which helps + * debuggability.
+ */ + rcu_exit = idtentry_enter_cond_rcu(regs); + + instrumentation_begin(); + handle_page_fault(regs, error_code, address); + instrumentation_end(); + + idtentry_exit_cond_rcu(regs, rcu_exit); } -NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 0a1898b8552e..075fe51317b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,44 +4,11 @@ #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -51,13 +18,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); - -void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high_prot); /* * This is the same as kmap_atomic() but can map memory that doesn't @@ -69,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn) } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -99,11 +60,8 @@ void __kunmap_atomic(void *kvaddr) BUG_ON(vaddr >= (unsigned long)high_memory); } #endif - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 5bfd5aef5378..cf5781142716 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -181,28 +181,21 @@ get_unmapped_area: #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 -static __init int setup_hugepagesz(char *opt) +bool __init arch_hugetlb_valid_size(unsigned long size) { - unsigned long ps = memparse(opt, &opt); - if (ps == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", - ps >> 20); - return 0; - } - return 1; + if (size == PMD_SIZE) + return true; + else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) + return true; + else + return false; } -__setup("hugepagesz=", setup_hugepagesz); #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) + if 
(boot_cpu_has(X86_FEATURE_GBPAGES)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1bba16c5742b..001dd7dc829f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -49,7 +49,7 @@ * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. */ -uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { +static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, @@ -57,9 +57,16 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; -EXPORT_SYMBOL(__cachemode2pte_tbl); -uint8_t __pte2cachemode_tbl[8] = { +unsigned long cachemode2protval(enum page_cache_mode pcm) +{ + if (likely(pcm == 0)) + return 0; + return __cachemode2pte_tbl[pcm]; +} +EXPORT_SYMBOL(cachemode2protval); + +static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, @@ -69,7 +76,22 @@ uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -EXPORT_SYMBOL(__pte2cachemode_tbl); + +/* Check that the write-protect PAT entry is set for write-protect */ +bool x86_has_pat_wp(void) +{ + return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP; +} + +enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +{ + unsigned long masked; + + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; + if (likely(masked == 0)) + return 0; + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; +} static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; @@ -121,8 +143,6 @@ __ref void *alloc_low_pages(unsigned int num) } else { pfn = pgt_buf_end; pgt_buf_end += num; - printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", - pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); } for (i = 0; i < num; i++) { @@ -172,6 +192,19 @@ struct map_range { static int page_size_mask; +/* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. Invoked on the boot CPU. + */ +static inline void cr4_set_bits_and_update_boot(unsigned long mask) +{ + mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + cr4_set_bits(mask); +} + static void __init probe_page_size_mask(void) { /* @@ -647,6 +680,28 @@ static void __init memory_map_bottom_up(unsigned long map_start, } } +/* + * The real mode trampoline, which is required for bootstrapping CPUs + * occupies only a small area under the low 1MB. See reserve_real_mode() + * for details. + * + * If KASLR is disabled the first PGD entry of the direct mapping is copied + * to map the real mode trampoline. + * + * If KASLR is enabled, copy only the PUD which covers the low 1MB + * area. This limits the randomization granularity to 1GB for both 4-level + * and 5-level paging. 
+ */ +static void __init init_trampoline(void) +{ +#ifdef CONFIG_X86_64 + if (!kaslr_memory_enabled()) + trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; + else + init_trampoline_kaslr(); +#endif +} + void __init init_mem_mapping(void) { unsigned long end; @@ -949,7 +1004,7 @@ void __init zone_sizes_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { @@ -957,7 +1012,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; -EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 4222a010057a..bda909e3e37e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -35,7 +35,6 @@ #include <asm/bios_ebda.h> #include <asm/processor.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/dma.h> #include <asm/fixmap.h> #include <asm/e820/api.h> @@ -396,15 +395,6 @@ repeat: pte_t *kmap_pte; -static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) -{ - pgd_t *pgd = pgd_offset_k(vaddr); - p4d_t *p4d = p4d_offset(pgd, vaddr); - pud_t *pud = pud_offset(p4d, vaddr); - pmd_t *pmd = pmd_offset(pud, vaddr); - return pte_offset_kernel(pmd, vaddr); -} - static void __init kmap_init(void) { unsigned long kmap_vstart; @@ -413,28 +403,17 @@ static void __init kmap_init(void) * Cache the first kmap pte: */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + kmap_pte = virt_to_kpte(kmap_vstart); } #ifdef CONFIG_HIGHMEM static void __init permanent_kmaps_init(pgd_t *pgd_base) { - unsigned long vaddr; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + unsigned long vaddr = PKMAP_BASE; - vaddr = PKMAP_BASE; page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - pgd = swapper_pg_dir + pgd_index(vaddr); - p4d = p4d_offset(pgd, vaddr); - pud = pud_offset(p4d, vaddr); - pmd = pmd_offset(pud, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + pkmap_page_table = virt_to_kpte(vaddr); } void __init add_highpages_with_active_regions(int nid, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3b289c2f75cd..dbae185511cd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -37,7 +37,6 @@ #include <asm/processor.h> #include <asm/bios_ebda.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/dma.h> #include <asm/fixmap.h> @@ -54,6 +53,7 @@ #include <asm/init.h> #include <asm/uv/uv.h> #include <asm/setup.h> +#include <asm/ftrace.h> #include "mm_internal.h" @@ -217,6 +217,11 @@ void sync_global_pgds(unsigned long start, unsigned long end) sync_global_pgds_l4(start, end); } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + sync_global_pgds(start, end); +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. @@ -298,7 +303,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) * It's enough to flush this one mapping. 
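A note on the init_32.c cleanup above: virt_to_kpte() is a new generic helper that performs exactly the walk the removed kmap_get_fixmap_pte() and permanent_kmaps_init() code open-coded. For reference, the walk it replaces is:

	pgd_t *pgd = pgd_offset_k(vaddr);
	p4d_t *p4d = p4d_offset(pgd, vaddr);
	pud_t *pud = pud_offset(p4d, vaddr);
	pmd_t *pmd = pmd_offset(pud, vaddr);
	pte_t *pte = pte_offset_kernel(pmd, vaddr);
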
* (PGE mappings get flushed as well) */ - __flush_tlb_one_kernel(vaddr); + flush_tlb_one_kernel(vaddr); } void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) @@ -367,7 +372,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, pgprot_t prot; pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | - pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); + protval_4k_2_large(cachemode2protval(cache)); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); @@ -1259,6 +1264,18 @@ void __init mem_init(void) mem_init_print_info(NULL); } +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + /* + * More CPUs always led to greater speedups on tested systems, up to + * all the nodes' CPUs. Use all since the system is otherwise idle + * now. + */ + return max_t(int, cpumask_weight(node_cpumask), 1); +} +#endif + int kernel_set_to_readonly; void mark_rodata_ro(void) @@ -1291,6 +1308,8 @@ void mark_rodata_ro(void) all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + set_ftrace_ops_ro(); + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 18c637c0dc6f..84d85dbd1dad 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -16,12 +16,12 @@ #include <linux/mmiotrace.h> #include <linux/mem_encrypt.h> #include <linux/efi.h> +#include <linux/pgtable.h> #include <asm/set_memory.h> #include <asm/e820/api.h> #include <asm/efi.h> #include <asm/fixmap.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/memtype.h> @@ -778,10 +778,8 @@ void __init *early_memremap_encrypted(resource_size_t phys_addr, void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, unsigned long size) { - /* Be sure the write-protect PAT entry is set for write-protect */ - if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + if (!x86_has_pat_wp()) return NULL; - return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); } @@ -799,10 +797,8 @@ void __init *early_memremap_decrypted(resource_size_t phys_addr, void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, unsigned long size) { - /* Be sure the write-protect PAT entry is set for write-protect */ - if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + if (!x86_has_pat_wp()) return NULL; - return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -889,5 +885,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx, set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else pte_clear(&init_mm, addr, pte); - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 763e71abc0fe..1a50434c8a4d 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -17,7 +17,6 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/pgtable.h> #include <asm/cpu_entry_area.h> extern struct range pfn_mapped[E820_MAX_ENTRIES]; diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index dc6182eecefa..fb620fd9dae9 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -24,9 +24,9 @@ 
#include <linux/init.h> #include <linux/random.h> #include <linux/memblock.h> +#include <linux/pgtable.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/kaslr.h> @@ -61,15 +61,6 @@ static inline unsigned long get_padding(struct kaslr_memory_region *region) return (region->size_tb << TB_SHIFT); } -/* - * Apply no randomization if KASLR was disabled at boot or if KASAN - * is enabled. KASAN shadow mappings rely on regions being PGD aligned. - */ -static inline bool kaslr_memory_enabled(void) -{ - return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); -} - /* Initialize base and padding for each memory region randomized with KASLR */ void __init kernel_randomize_memory(void) { @@ -148,7 +139,7 @@ void __init kernel_randomize_memory(void) } } -static void __meminit init_trampoline_pud(void) +void __meminit init_trampoline_kaslr(void) { pud_t *pud_page_tramp, *pud, *pud_tramp; p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; @@ -189,25 +180,3 @@ static void __meminit init_trampoline_pud(void) __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); } } - -/* - * The real mode trampoline, which is required for bootstrapping CPUs - * occupies only a small area under the low 1MB. See reserve_real_mode() - * for details. - * - * If KASLR is disabled the first PGD entry of the direct mapping is copied - * to map the real mode trampoline. - * - * If KASLR is enabled, copy only the PUD which covers the low 1MB - * area. This limits the randomization granularity to 1GB for both 4-level - * and 5-level paging. - */ -void __meminit init_trampoline(void) -{ - if (!kaslr_memory_enabled()) { - init_trampoline_default(); - return; - } - - init_trampoline_pud(); -} diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 9994353fb75d..be020a7bc414 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -130,7 +130,7 @@ static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) pmdval_t v = pmd_val(*pmd); if (clear) { *old = v; - new_pmd = pmd_mknotpresent(*pmd); + new_pmd = pmd_mkinvalid(*pmd); } else { /* Presume this has been called with clear==true previously */ new_pmd = __pmd(*old); @@ -173,7 +173,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one_kernel(f->addr); + flush_tlb_one_kernel(f->addr); return 0; } diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c index f5b85bdc0535..e1d7d7477c22 100644 --- a/arch/x86/mm/maccess.c +++ b/arch/x86/mm/maccess.c @@ -9,35 +9,21 @@ static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } -static __always_inline bool invalid_probe_range(u64 vaddr) +bool probe_kernel_read_allowed(const void *unsafe_src, size_t size) { + unsigned long vaddr = (unsigned long)unsafe_src; + /* * Range covering the highest possible canonical userspace address * as well as non-canonical address range. For the canonical range * we also need to include the userspace guard page. 
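canonical_address() implements the usual sign-extension check: shifting left and then arithmetic-shifting right by (64 - vaddr_bits) replicates the top implemented bit, and a pointer is canonical iff that round trip is the identity. A worked example with 48 implemented bits:

	canonical_address(0x00007fffffffffff, 48) == 0x00007fffffffffff	/* unchanged: canonical */
	canonical_address(0x0000800000000000, 48) == 0xffff800000000000	/* changed: non-canonical */
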
*/ - return vaddr < TASK_SIZE_MAX + PAGE_SIZE || - canonical_address(vaddr, boot_cpu_data.x86_virt_bits) != vaddr; + return vaddr >= TASK_SIZE_MAX + PAGE_SIZE && + canonical_address(vaddr, boot_cpu_data.x86_virt_bits) == vaddr; } #else -static __always_inline bool invalid_probe_range(u64 vaddr) +bool probe_kernel_read_allowed(const void *unsafe_src, size_t size) { - return vaddr < TASK_SIZE_MAX; + return (unsigned long)unsafe_src >= TASK_SIZE_MAX; } #endif - -long probe_kernel_read_strict(void *dst, const void *src, size_t size) -{ - if (unlikely(invalid_probe_range((unsigned long)src))) - return -EFAULT; - - return __probe_kernel_read(dst, src, size); -} - -long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count) -{ - if (unlikely(invalid_probe_range((unsigned long)unsafe_addr))) - return -EFAULT; - - return __strncpy_from_unsafe(dst, unsafe_addr, count); -} diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index a03614bd3e1a..4a781cf99e92 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -134,7 +134,7 @@ static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; } while (size); - __native_flush_tlb(); + flush_tlb_local(); } void __init sme_unmap_bootdata(char *real_mode_data) diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 106ead05bbe3..7a84fc8bc5c3 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -8,7 +8,7 @@ */ #include <linux/linkage.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <asm/page.h> #include <asm/processor-flags.h> #include <asm/msr-index.h> diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index cb91eccc4960..c90c20904a60 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -18,7 +18,9 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/compat.h> +#include <linux/elf-randomize.h> #include <asm/elf.h> +#include <asm/io.h> #include "physaddr.h" diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 109325d77b3e..bd7aff5c51f7 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -17,8 +17,8 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/io.h> -#include <asm/pgtable.h> #include <linux/mmiotrace.h> +#include <linux/pgtable.h> #include <asm/e820/api.h> /* for ISA_START_ADDRESS */ #include <linux/atomic.h> #include <linux/percpu.h> @@ -372,7 +372,7 @@ static void enter_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL && + if (!cpumask_available(downed_cpus) && !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { pr_notice("Failed to allocate mask\n"); goto out; @@ -402,7 +402,7 @@ static void leave_uniprocessor(void) int cpu; int err; - if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) + if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) return; pr_notice("Re-enabling CPUs...\n"); for_each_cpu(cpu, downed_cpus) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 59ba008504dc..8ee952038c80 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -517,8 +517,10 @@ static void __init numa_clear_kernel_node_hotplug(void) * reserve specific pages for Sandy Bridge graphics. 
] */ for_each_memblock(reserved, mb_region) { - if (mb_region->nid != MAX_NUMNODES) - node_set(mb_region->nid, reserved_nodemask); + int nid = memblock_get_region_node(mb_region); + + if (nid != MAX_NUMNODES) + node_set(nid, reserved_nodemask); } /* @@ -735,12 +737,9 @@ void __init x86_numa_init(void) static void __init init_memory_less_node(int nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - /* Allocate and initialize node data. Memory-less node is now online.*/ alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); + free_area_init_memoryless_node(nid); /* * All zonelists will be built later in start_kernel() after per cpu diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f2bd3d61e16b..104544359d69 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -27,40 +27,6 @@ #include "numa_internal.h" -#ifdef CONFIG_DISCONTIGMEM -/* - * 4) physnode_map - the mapping between a pfn and owning node - * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 64Mb break (each element of the array will - * represent 64Mb of memory and will be marked by the node id. so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * - * physnode_map[0-15] = 0; - * physnode_map[16-31] = 1; - * physnode_map[32- ] = -1; - */ -s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1}; -EXPORT_SYMBOL(physnode_map); - -void memory_present(int nid, unsigned long start, unsigned long end) -{ - unsigned long pfn; - - printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", - nid, start, end); - printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); - printk(KERN_DEBUG " "); - start = round_down(start, PAGES_PER_SECTION); - end = round_up(end, PAGES_PER_SECTION); - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { - physnode_map[pfn / PAGES_PER_SECTION] = nid; - printk(KERN_CONT "%lx ", pfn); - } - printk(KERN_CONT "\n"); -} -#endif - extern unsigned long highend_pfn, highstart_pfn; void __init initmem_init(void) diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c index facce271e8b9..0612a73638a8 100644 --- a/arch/x86/mm/pat/cpa-test.c +++ b/arch/x86/mm/pat/cpa-test.c @@ -14,7 +14,6 @@ #include <linux/vmalloc.h> #include <asm/cacheflush.h> -#include <asm/pgtable.h> #include <asm/kdebug.h> /* diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 394be8611748..8f665c352bf0 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -46,7 +46,6 @@ #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> -#include <asm/pgtable.h> #include <asm/fcntl.h> #include <asm/e820/api.h> #include <asm/mtrr.h> diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c index a07e4882bf36..645613d59942 100644 --- a/arch/x86/mm/pat/memtype_interval.c +++ b/arch/x86/mm/pat/memtype_interval.c @@ -14,8 +14,8 @@ #include <linux/interval_tree_generic.h> #include <linux/sched.h> #include <linux/gfp.h> +#include <linux/pgtable.h> -#include <asm/pgtable.h> #include <asm/memtype.h> #include "memtype.h" diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 59eca6a94ce7..77e04304a2a7 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -43,7 +43,8 @@ struct cpa_data { unsigned long pfn; unsigned int flags; unsigned int force_split : 1, - force_static_prot : 1; + 
force_static_prot : 1, + force_flush_all : 1; struct page **pages; }; @@ -68,6 +69,11 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) +{ + return __pgprot(cachemode2protval(pcm)); +} + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -340,7 +346,7 @@ static void __cpa_flush_tlb(void *data) unsigned int i; for (i = 0; i < cpa->numpages; i++) - __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); + flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static void cpa_flush(struct cpa_data *data, int cache) @@ -355,10 +361,10 @@ static void cpa_flush(struct cpa_data *data, int cache) return; } - if (cpa->numpages <= tlb_single_page_flush_ceiling) - on_each_cpu(__cpa_flush_tlb, cpa, 1); - else + if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) flush_tlb_all(); + else + on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) return; @@ -1598,6 +1604,8 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + cpa->force_flush_all = 1; + ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; @@ -1618,6 +1626,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7bd2c3a52297..dfd82f51ba66 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -3,7 +3,6 @@ #include <linux/gfp.h> #include <linux/hugetlb.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> @@ -19,6 +18,14 @@ EXPORT_SYMBOL(physical_mask); #define PGTABLE_HIGHMEM 0 #endif +#ifndef CONFIG_PARAVIRT +static inline +void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_page(tlb, table); +} +#endif + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) @@ -706,11 +713,9 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) if (pud_present(*pud) && !pud_huge(*pud)) return 0; - prot = pgprot_4k_2_large(prot); - set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } @@ -738,11 +743,9 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) if (pmd_present(*pmd) && !pmd_huge(*pmd)) return 0; - prot = pgprot_4k_2_large(prot); - set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0e6700eaa4f9..1953685c2ddf 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -11,7 +11,6 @@ #include <linux/spinlock.h> #include <asm/cpu_entry_area.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> #include <asm/e820/api.h> @@ -64,7 +63,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) * It's enough to flush this one mapping. 
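The cpa_flush() hunk above reorders the branch so that either the new force_flush_all flag (set once cpa_process_alias() has queued extra pages) or a large page count selects the full TLB flush, instead of the per-page loop missing the alias pages. Isolated, the decision looks like the sketch below; the ceiling of 33 is this era's tlb_single_page_flush_ceiling (see the tlb.c comment further down):

#include <stdbool.h>
#include <stdio.h>

#define TLB_SINGLE_PAGE_FLUSH_CEILING 33

static const char *cpa_pick_flush(bool force_flush_all, unsigned long numpages)
{
	if (force_flush_all || numpages > TLB_SINGLE_PAGE_FLUSH_CEILING)
		return "flush_tlb_all()";
	return "per-page __cpa_flush_tlb() on each CPU";
}

int main(void)
{
	printf("%s\n", cpa_pick_flush(false, 8));   /* small range: per-page */
	printf("%s\n", cpa_pick_flush(false, 64));  /* large range: full flush */
	printf("%s\n", cpa_pick_flush(true, 1));    /* alias processed: full flush */
	return 0;
}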
* (PGE mappings get flushed as well) */ - __flush_tlb_one_kernel(vaddr); + flush_tlb_one_kernel(vaddr); } unsigned long __FIXADDR_TOP = 0xfffff000; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 843aa10a4cb6..a8a924b3c335 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -34,7 +34,6 @@ #include <asm/vsyscall.h> #include <asm/cmdline.h> #include <asm/pti.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -448,13 +447,7 @@ static void __init pti_clone_user_shared(void) * the sp1 and sp2 slots. * * This is done for all possible CPUs during boot to ensure - * that it's propagated to all mms. If we were to add one of - * these mappings during CPU hotplug, we would need to take - * some measure to make sure that every mm that subsequently - * ran on that CPU would have the relevant PGD entry in its - * pagetables. The usual vmalloc_fault() mechanism would not - * work for page faults taken in entry_SYSCALL_64 before RSP - * is set up. + * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); @@ -499,12 +492,12 @@ static void __init pti_setup_espfix64(void) } /* - * Clone the populated PMDs of the entry and irqentry text and force it RO. + * Clone the populated PMDs of the entry text and force it RO. */ static void pti_clone_entry_text(void) { pti_clone_pgtable((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end, + (unsigned long) __entry_text_end, PTI_CLONE_PMD); } diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index adb3c5784dac..ed5667f5169f 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -2,8 +2,8 @@ #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/init.h> +#include <linux/pgtable.h> -#include <asm/pgtable.h> #include <asm/proto.h> #include <asm/cpufeature.h> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 66f96f21a7b6..1a3569b43aa5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -18,6 +18,16 @@ #include "mm_internal.h" +#ifdef CONFIG_PARAVIRT +# define STATIC_NOPV +#else +# define STATIC_NOPV static +# define __flush_tlb_local native_flush_tlb_local +# define __flush_tlb_global native_flush_tlb_global +# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) +# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info) +#endif + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -39,6 +49,126 @@ #define LAST_USER_MM_IBPB 0x1UL /* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. 
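The naming scheme spelled out in the tlb.c comment above (ASID vs. kPCID vs. uPCID) reduces to two tiny helpers. A standalone sketch of the arithmetic; TLB_NR_DYN_ASIDS being 6 and the PTI switch bit being CR3 bit 11 (hence the "+ 2048") are assumptions mirroring this era of the kernel:

#include <stdio.h>

#define TLB_NR_DYN_ASIDS           6
#define X86_CR3_PTI_PCID_USER_BIT 11

static unsigned int kern_pcid(unsigned int asid)
{
	return asid + 1;   /* PCID 0 is reserved for non-PCID-aware use */
}

static unsigned int user_pcid(unsigned int asid)
{
	return kern_pcid(asid) | (1u << X86_CR3_PTI_PCID_USER_BIT);
}

int main(void)
{
	for (unsigned int asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %u -> kPCID %u, uPCID %u\n",
		       asid, kern_pcid(asid), user_pcid(asid));
	return 0;
}

ASID 0 maps to kPCID 1 and uPCID 2049, matching the ranges given in the comment.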
Corresponds to kPCID + 2048. + * + */ + +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS 12 + +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. + */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, + * so do not bother to clear it. + * + * If PCID is on, ASID-aware code paths put the ASID+1 into the + * PCID bits. This serves two purposes. It prevents a nasty + * situation in which PCID-unaware code saves CR3, loads some other + * value (with PCID == 0), and then restores CR3, thus corrupting + * the TLB for ASID 0 if the saved ASID was nonzero. It also means + * that any bugs involving loading a PCID-enabled CR3 with + * CR4.PCIDE off will trigger deterministically. + */ + return asid + 1; +} + +/* + * Given @asid, compute uPCID + */ +static inline u16 user_pcid(u16 asid) +{ + u16 ret = kern_pcid(asid); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; +#endif + return ret; +} + +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) +{ + if (static_cpu_has(X86_FEATURE_PCID)) { + return __sme_pa(pgd) | kern_pcid(asid); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(pgd); + } +} + +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. This should work even after + * boot because all CPU's the have same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; +} + +/* * We get here when we do something requiring a TLB invalidation * but could not go invalidate all of the contexts. We do the * necessary invalidation by clearing out the 'ctx_id' which @@ -110,6 +240,32 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid) +{ + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + return; + + /* + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it. 
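invalidate_user_asid(), introduced in the hunk above, defers the user-PCID flush: it only records the ASID in a per-CPU mask, and the exit-to-user path (SWITCH_TO_USER_CR3) performs the flushing CR3 write. A sketch of that hand-off, with a plain global standing in for the per-CPU mask and printf in place of the CR3 writes:

#include <stdio.h>

static unsigned long user_pcid_flush_mask;

static unsigned int kern_pcid(unsigned int asid) { return asid + 1; }

static void invalidate_user_asid(unsigned int asid)
{
	user_pcid_flush_mask |= 1UL << kern_pcid(asid);
}

/* What the exit path conceptually does for the ASID it switches to: */
static void switch_to_user_cr3(unsigned int asid)
{
	unsigned long bit = 1UL << kern_pcid(asid);

	if (user_pcid_flush_mask & bit) {
		user_pcid_flush_mask &= ~bit;
		printf("CR3 load without NOFLUSH: uPCID of ASID %u flushed\n", asid);
	} else {
		printf("CR3 load with NOFLUSH: TLB for ASID %u kept\n", asid);
	}
}

int main(void)
{
	switch_to_user_cr3(2);      /* nothing pending: keep the TLB */
	invalidate_user_asid(2);    /* e.g. a flush arrived for this ASID */
	switch_to_user_cr3(2);      /* pending bit: flush on the way out */
	return 0;
}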
+ */ + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); +} + static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) { unsigned long new_mm_cr3; @@ -161,34 +317,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) -{ - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. - */ - p4d_t *p4d = p4d_offset(pgd, sp); - - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); - - set_p4d(p4d, *p4d_ref); - } - } -} - static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; @@ -272,6 +400,26 @@ static void cond_ibpb(struct task_struct *next) } } +#ifdef CONFIG_PERF_EVENTS +static inline void cr4_update_pce_mm(struct mm_struct *mm) +{ + if (static_branch_unlikely(&rdpmc_always_available_key) || + (!static_branch_unlikely(&rdpmc_never_available_key) && + atomic_read(&mm->context.perf_rdpmc_allowed))) + cr4_set_bits_irqsoff(X86_CR4_PCE); + else + cr4_clear_bits_irqsoff(X86_CR4_PCE); +} + +void cr4_update_pce(void *ignored) +{ + cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm)); +} + +#else +static inline void cr4_update_pce_mm(struct mm_struct *mm) { } +#endif + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -377,15 +525,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ cond_ibpb(tsk); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } - /* * Stop remote flushes for the previous mm. * Skip kernel threads; we never send init_mm TLB flushing IPIs, @@ -440,7 +579,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); if (next != real_prev) { - load_mm_cr4_irqsoff(next); + cr4_update_pce_mm(next); switch_ldt(real_prev, next); } } @@ -617,7 +756,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, unsigned long addr = f->start; while (addr < f->end) { - __flush_tlb_one_user(addr); + flush_tlb_one_user(addr); addr += 1UL << f->stride_shift; } if (local) @@ -625,7 +764,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. 
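The partial-flush branch of flush_tlb_func_common(), seen just above, walks the range in 1 << stride_shift steps, so a huge-page range costs one invalidation per 2 MiB rather than per 4 KiB. Isolated as a runnable sketch (addresses are made up):

#include <stdio.h>

struct flush_tlb_info {
	unsigned long start, end;
	unsigned int stride_shift;
};

static void flush_tlb_one_user(unsigned long addr)
{
	printf("  invlpg %#lx\n", addr);
}

static void flush_range(const struct flush_tlb_info *f)
{
	unsigned long nr = (f->end - f->start) >> f->stride_shift;

	printf("%lu invalidation(s):\n", nr);
	for (unsigned long addr = f->start; addr < f->end;
	     addr += 1UL << f->stride_shift)
		flush_tlb_one_user(addr);
}

int main(void)
{
	struct flush_tlb_info small = { 0x400000, 0x404000, 12 }; /* 4 x 4 KiB */
	struct flush_tlb_info huge  = { 0x400000, 0x800000, 21 }; /* 2 x 2 MiB */

	flush_range(&small);
	flush_range(&huge);
	return 0;
}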
*/ - local_flush_tlb(); + flush_tlb_local(); if (local) count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); trace_tlb_flush(reason, TLB_FLUSH_ALL); @@ -660,8 +799,8 @@ static bool tlb_is_not_lazy(int cpu, void *data) return !per_cpu(cpu_tlbstate.is_lazy, cpu); } -void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) @@ -711,6 +850,12 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1, cpumask); } +void flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + __flush_tlb_others(cpumask, info); +} + /* * See Documentation/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at @@ -821,7 +966,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -844,6 +989,164 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) } /* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. */ + VM_WARN_ON(in_nmi() || preemptible()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} +EXPORT_SYMBOL_GPL(__get_current_cr3_fast); + +/* + * Flush one page in the kernel mapping + */ +void flush_tlb_one_kernel(unsigned long addr) +{ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + + /* + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its + * paravirt equivalent. Even with PCID, this is sufficient: we only + * use PCID if we also use global PTEs for the kernel mapping, and + * INVLPG flushes global translations across all address spaces. + * + * If PTI is on, then the kernel is mapped with non-global PTEs, and + * __flush_tlb_one_user() will flush the given address for the current + * kernel address space and for its usermode counterpart, but it does + * not flush it for other address spaces. + */ + flush_tlb_one_user(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * See above. We need to propagate the flush to all other address + * spaces. In principle, we only need to propagate it to kernelmode + * address spaces, but the extra bookkeeping we would need is not + * worth it. + */ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/* + * Flush one page in the user mapping + */ +STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) +{ + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. 
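__get_current_cr3_fast() above reconstructs CR3 from loaded_mm and the current ASID instead of reading the register, relying on build_cr3() from earlier in this hunk. How those helpers compose a CR3 value, as a sketch; the pgd physical address is invented for the demo:

#include <stdint.h>
#include <stdio.h>

#define CR3_NOFLUSH (1ULL << 63)   /* keep the PCID's cached translations */

static uint64_t build_cr3(uint64_t pgd_pa, unsigned int asid)
{
	return pgd_pa | (uint64_t)(asid + 1);   /* kern_pcid(asid) */
}

static uint64_t build_cr3_noflush(uint64_t pgd_pa, unsigned int asid)
{
	return build_cr3(pgd_pa, asid) | CR3_NOFLUSH;
}

int main(void)
{
	uint64_t pgd_pa = 0x1b2d000;   /* hypothetical, page-aligned */

	printf("flushing switch: %#llx\n",
	       (unsigned long long)build_cr3(pgd_pa, 2));
	printf("no-flush switch: %#llx\n",
	       (unsigned long long)build_cr3_noflush(pgd_pa, 2));
	return 0;
}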
+ */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); +} + +void flush_tlb_one_user(unsigned long addr) +{ + __flush_tlb_one_user(addr); +} + +/* + * Flush everything + */ +STATIC_NOPV void native_flush_tlb_global(void) +{ + unsigned long cr4, flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; + } + + /* + * Read-modify-write to CR4 - protect it from preemption and + * from interrupts. (Use the raw variant because this code can + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + native_write_cr4(cr4); + + raw_local_irq_restore(flags); +} + +/* + * Flush the entire current user mapping + */ +STATIC_NOPV void native_flush_tlb_local(void) +{ + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). + */ + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); +} + +void flush_tlb_local(void) +{ + __flush_tlb_local(); +} + +/* + * Flush everything + */ +void __flush_tlb_all(void) +{ + /* + * This is to catch users with enabled preemption and the PGE feature + * and don't trigger the warning in __native_flush_tlb(). + */ + VM_WARN_ON_ONCE(preemptible()); + + if (boot_cpu_has(X86_FEATURE_PGE)) { + __flush_tlb_global(); + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. + */ + flush_tlb_local(); + } +} +EXPORT_SYMBOL_GPL(__flush_tlb_all); + +/* * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. * This means that the 'struct flush_tlb_info' that describes which mappings to * flush is actually fixed. We therefore set a single fixed struct and use it in @@ -874,6 +1177,38 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) put_cpu(); } +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* + * The condition we want to check is + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() + * is supposed to be reasonably fast. + * + * Instead, we check the almost equivalent but somewhat conservative + * condition below, and we rely on the fact that switch_mm_irqs_off() + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. 
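When INVPCID is unavailable, native_flush_tlb_global() above falls back to toggling CR4.PGE, since clearing PGE invalidates every TLB entry, global mappings included. Modeled with a plain variable in place of the control register; on real hardware this runs with interrupts disabled, as the hunk explains:

#include <stdio.h>

#define X86_CR4_PGE (1UL << 7)

static unsigned long fake_cr4 = 0x370660UL | X86_CR4_PGE;

static void write_cr4(unsigned long val)
{
	fake_cr4 = val;
	printf("CR4 <- %#lx%s\n", val,
	       (val & X86_CR4_PGE) ? "" : "   (PGE cleared: full TLB flush)");
}

int main(void)
{
	unsigned long cr4 = fake_cr4;

	write_cr4(cr4 ^ X86_CR4_PGE);   /* toggle PGE off: flush everything */
	write_cr4(cr4);                 /* restore the old value */
	return 0;
}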
+ */ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + + return true; +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5ea7c2cf7ab4..42b6709e6dc7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -158,6 +158,19 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_AX)); } +/* + * is_ereg_8l() == true if BPF register 'reg' is mapped to access x86-64 + * lower 8-bit registers dil,sil,bpl,spl,r8b..r15b, which need extra byte + * of encoding. al,cl,dl,bl have simpler encoding. + */ +static bool is_ereg_8l(u32 reg) +{ + return is_ereg(reg) || + (1 << reg) & (BIT(BPF_REG_1) | + BIT(BPF_REG_2) | + BIT(BPF_REG_FP)); +} + static bool is_axreg(u32 reg) { return reg == BPF_REG_0; @@ -598,9 +611,8 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) switch (size) { case BPF_B: /* Emit 'mov byte ptr [rax + off], al' */ - if (is_ereg(dst_reg) || is_ereg(src_reg) || - /* We have to add extra byte for x86 SIL, DIL regs */ - src_reg == BPF_REG_1 || src_reg == BPF_REG_2) + if (is_ereg(dst_reg) || is_ereg_8l(src_reg)) + /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); else EMIT1(0x88); diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 4d2a7a764602..96fde03aa987 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1475,8 +1475,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true; - const bool sstk = insn->src_reg == BPF_REG_AX ? false : true; + const bool dstk = insn->dst_reg != BPF_REG_AX; + const bool sstk = insn->src_reg != BPF_REG_AX; const u8 code = insn->code; const u8 *dst = bpf2ia32[insn->dst_reg]; const u8 *src = bpf2ia32[insn->src_reg]; @@ -1847,14 +1847,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_B: case BPF_H: case BPF_W: - if (!bpf_prog->aux->verifier_zext) + if (bpf_prog->aux->verifier_zext) break; if (dstk) { EMIT3(0xC7, add_1reg(0x40, IA32_EBP), STACK_VAR(dst_hi)); EMIT(0x0, 4); } else { - EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0); + /* xor dst_hi,dst_hi */ + EMIT2(0x33, + add_2reg(0xC0, dst_hi, dst_hi)); } break; case BPF_DW: @@ -2013,8 +2015,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP32 | BPF_JSET | BPF_X: { bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP; - u8 dreg_lo = dstk ? IA32_EAX : dst_lo; - u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 dreg_lo = IA32_EAX; + u8 dreg_hi = IA32_EDX; u8 sreg_lo = sstk ? IA32_ECX : src_lo; u8 sreg_hi = sstk ? IA32_EBX : src_hi; @@ -2026,6 +2028,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst_hi)); + } else { + /* mov dreg_lo,dst_lo */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, dst_lo)); + if (is_jmp64) + /* mov dreg_hi,dst_hi */ + EMIT2(0x89, + add_2reg(0xC0, dreg_hi, dst_hi)); } if (sstk) { @@ -2050,8 +2059,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: { bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP; - u8 dreg_lo = dstk ? 
IA32_EAX : dst_lo; - u8 dreg_hi = dstk ? IA32_EDX : dst_hi; + u8 dreg_lo = IA32_EAX; + u8 dreg_hi = IA32_EDX; u8 sreg_lo = IA32_ECX; u8 sreg_hi = IA32_EBX; u32 hi; @@ -2064,6 +2073,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst_hi)); + } else { + /* mov dreg_lo,dst_lo */ + EMIT2(0x89, add_2reg(0xC0, dreg_lo, dst_lo)); + if (is_jmp64) + /* mov dreg_hi,dst_hi */ + EMIT2(0x89, + add_2reg(0xC0, dreg_hi, dst_hi)); } /* mov ecx,imm32 */ diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e723559c386a..0c67a5a94de3 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -572,6 +572,10 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ec, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ed, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26c, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar); /* * Device [1022:7808] diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 91220cc25854..e3f1ca316068 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -60,8 +60,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) } #ifdef CONFIG_ACPI -static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, - bool set_pirq) +static int xen_register_pirq(u32 gsi, int triggering, bool set_pirq) { int rc, pirq = -1, irq = -1; struct physdev_map_pirq map_irq; @@ -94,9 +93,6 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, name = "ioapic-level"; } - if (gsi_override >= 0) - gsi = gsi_override; - irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); if (irq < 0) goto out; @@ -112,12 +108,12 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, if (!xen_hvm_domain()) return -1; - return xen_register_pirq(gsi, -1 /* no GSI override */, trigger, + return xen_register_pirq(gsi, trigger, false /* no mapping of GSI to PIRQ */); } #ifdef CONFIG_XEN_DOM0 -static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity) +static int xen_register_gsi(u32 gsi, int triggering, int polarity) { int rc, irq; struct physdev_setup_gsi setup_gsi; @@ -128,7 +124,7 @@ static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polar printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", gsi, triggering, polarity); - irq = xen_register_pirq(gsi, gsi_override, triggering, true); + irq = xen_register_pirq(gsi, triggering, true); setup_gsi.gsi = gsi; setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); @@ -148,7 +144,7 @@ static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polar static int acpi_register_gsi_xen(struct device *dev, u32 gsi, int trigger, int polarity) { - return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity); + return xen_register_gsi(gsi, trigger, polarity); } #endif #endif @@ -491,7 +487,7 @@ int __init pci_xen_initial_domain(void) if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) continue; - xen_register_pirq(irq, -1 /* no GSI override */, + xen_register_pirq(irq, trigger ? 
ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, true /* Map GSI to PIRQ */); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1aae5302501d..e966115d105c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -62,12 +62,12 @@ static unsigned long efi_runtime, efi_nr_tables; unsigned long efi_fw_vendor, efi_config_table; static const efi_config_table_type_t arch_tables[] __initconst = { - {EFI_PROPERTIES_TABLE_GUID, "PROP", &prop_phys}, - {UGA_IO_PROTOCOL_GUID, "UGA", &uga_phys}, + {EFI_PROPERTIES_TABLE_GUID, &prop_phys, "PROP" }, + {UGA_IO_PROTOCOL_GUID, &uga_phys, "UGA" }, #ifdef CONFIG_X86_UV - {UV_SYSTEM_TABLE_GUID, "UVsystab", &uv_systab_phys}, + {UV_SYSTEM_TABLE_GUID, &uv_systab_phys, "UVsystab" }, #endif - {NULL_GUID, NULL, NULL}, + {}, }; static const unsigned long * const efi_tables[] = { diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index c049c432745d..826ead67753d 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -24,11 +24,11 @@ #include <linux/types.h> #include <linux/ioport.h> #include <linux/efi.h> +#include <linux/pgtable.h> #include <asm/io.h> #include <asm/desc.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/efi.h> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 211bb9358b73..8e364c4c6768 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -39,7 +39,6 @@ #include <asm/setup.h> #include <asm/page.h> #include <asm/e820/api.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm/efi.h> @@ -202,7 +201,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - unsigned long pfn, text, pf; + unsigned long pfn, text, pf, rodata; struct page *page; unsigned npages; pgd_t *pgd = efi_mm.pgd; @@ -256,7 +255,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */ - npages = (__end_rodata_aligned - _text) >> PAGE_SHIFT; + npages = (_etext - _text) >> PAGE_SHIFT; text = __pa(_text); pfn = text >> PAGE_SHIFT; @@ -266,6 +265,14 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 1; } + npages = (__end_rodata - __start_rodata) >> PAGE_SHIFT; + rodata = __pa(__start_rodata); + pfn = rodata >> PAGE_SHIFT; + if (kernel_map_pages_in_pgd(pgd, pfn, rodata, npages, pf)) { + pr_err("Failed to map kernel rodata 1:1\n"); + return 1; + } + return 0; } @@ -638,7 +645,7 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor, phys_vendor = virt_to_phys_or_null(vnd); phys_data = virt_to_phys_or_null_size(data, data_size); - if (!phys_name || !phys_data) + if (!phys_name || (data && !phys_data)) status = EFI_INVALID_PARAMETER; else status = efi_thunk(set_variable, phys_name, phys_vendor, @@ -669,7 +676,7 @@ efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, phys_vendor = virt_to_phys_or_null(vnd); phys_data = virt_to_phys_or_null_size(data, data_size); - if (!phys_name || !phys_data) + if (!phys_name || (data && !phys_data)) status = EFI_INVALID_PARAMETER; else status = efi_thunk(set_variable, phys_name, phys_vendor, diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 15da118f04f0..90380a17ab23 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ 
b/arch/x86/platform/efi/efi_stub_64.S @@ -21,7 +21,7 @@ SYM_FUNC_START(__efi_call) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx - CALL_NOSPEC %rdi + CALL_NOSPEC rdi leave ret SYM_FUNC_END(__efi_call) diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 933dd4fe3a97..f03a6883dcc6 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -52,7 +52,7 @@ static const char * const lid_wake_mode_names[] = { static void battery_status_changed(void) { - struct power_supply *psy = power_supply_get_by_name("olpc-battery"); + struct power_supply *psy = power_supply_get_by_name("olpc_battery"); if (psy) { power_supply_changed(psy); @@ -62,7 +62,7 @@ static void battery_status_changed(void) static void ac_status_changed(void) { - struct power_supply *psy = power_supply_get_by_name("olpc-ac"); + struct power_supply *psy = power_supply_get_by_name("olpc_ac"); if (psy) { power_supply_changed(psy); diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 089413cd944e..85f4638764d6 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -75,7 +75,7 @@ static struct kobj_attribute lid_wake_on_close_attr = static void battery_status_changed(void) { - struct power_supply *psy = power_supply_get_by_name("olpc-battery"); + struct power_supply *psy = power_supply_get_by_name("olpc_battery"); if (psy) { power_supply_changed(psy); @@ -85,7 +85,7 @@ static void battery_status_changed(void) static void ac_status_changed(void) { - struct power_supply *psy = power_supply_get_by_name("olpc-ac"); + struct power_supply *psy = power_supply_get_by_name("olpc_ac"); if (psy) { power_supply_changed(psy); diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c index 20a064568463..6bab0f0aa8f3 100644 --- a/arch/x86/platform/olpc/olpc_ofw.c +++ b/arch/x86/platform/olpc/olpc_ofw.c @@ -3,12 +3,12 @@ #include <linux/export.h> #include <linux/spinlock_types.h> #include <linux/init.h> +#include <linux/pgtable.h> #include <asm/page.h> #include <asm/setup.h> #include <asm/io.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> -#include <asm/pgtable.h> #include <asm/olpc_ofw.h> /* address of OFW callback interface; will be NULL if OFW isn't found */ diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index c60255da5a6c..4494589a288a 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -45,7 +45,8 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } -s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) +static s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, + u64 a5) { s64 ret; @@ -57,10 +58,9 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) return ret; } -EXPORT_SYMBOL_GPL(uv_bios_call); -s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) +static s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) { unsigned long bios_flags; s64 ret; @@ -77,18 +77,13 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } - long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); long sn_coherency_id; -EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); long system_serial_number; -EXPORT_SYMBOL_GPL(system_serial_number); int 
uv_type; -EXPORT_SYMBOL_GPL(uv_type); - s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, long *region, long *ssn) @@ -115,7 +110,6 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, *ssn = v1; return ret; } -EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, @@ -166,7 +160,6 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, (u64)ticks_per_second, 0, 0, 0); } -EXPORT_SYMBOL_GPL(uv_bios_freq_base); /* * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target @@ -185,7 +178,6 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, (u64)decode, (u64)domain, (u64)bus, 0, 0); } -EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); int uv_bios_init(void) { diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 1fd321f37f1b..0ac96ca304c7 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -293,10 +293,10 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, * This must be a normal message, or retry of a normal message */ if (msg->address == TLB_FLUSH_ALL) { - local_flush_tlb(); + flush_tlb_local(); stat->d_alltlb++; } else { - __flush_tlb_one_user(msg->address); + flush_tlb_one_user(msg->address); stat->d_onetlb++; } stat->d_requestee++; @@ -1272,7 +1272,7 @@ static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) * (the resource will not be freed until noninterruptable cpus see this * interrupt; hardware may timeout the s/w ack and reply ERROR) */ -void uv_bau_message_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_uv_bau_message) { int count = 0; cycles_t time_start; diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 62214731fea5..266773e2fb37 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -21,7 +21,7 @@ static ssize_t partition_id_show(struct kobject *kobj, static ssize_t coherence_id_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%ld\n", uv_partition_coherence_id()); + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_coherency_id); } static struct kobj_attribute partition_id_attr = diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index aaff9ed7ff45..7c65102debaf 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -13,8 +13,8 @@ #include <linux/perf_event.h> #include <linux/tboot.h> #include <linux/dmi.h> +#include <linux/pgtable.h> -#include <asm/pgtable.h> #include <asm/proto.h> #include <asm/mtrr.h> #include <asm/page.h> @@ -307,7 +307,7 @@ int hibernate_resume_nonboot_cpu_disable(void) if (ret) return ret; smp_ops.play_dead = resume_play_dead; - ret = disable_nonboot_cpus(); + ret = freeze_secondary_cpus(0); smp_ops.play_dead = play_dead; return ret; } diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index fc413717a45f..d147f1b2c925 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -12,6 +12,7 @@ #include <linux/scatterlist.h> #include <linux/kdebug.h> #include <linux/cpu.h> +#include <linux/pgtable.h> #include <crypto/hash.h> @@ -19,7 +20,6 @@ #include <asm/init.h> #include <asm/proto.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/mtrr.h> #include <asm/sections.h> #include <asm/suspend.h> diff --git a/arch/x86/power/hibernate_32.c 
b/arch/x86/power/hibernate_32.c index a1061d471b73..223d5bca29b8 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -8,9 +8,9 @@ #include <linux/gfp.h> #include <linux/suspend.h> #include <linux/memblock.h> +#include <linux/pgtable.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/mmzone.h> #include <asm/sections.h> #include <asm/suspend.h> diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 0197095d9637..a595953f1d6d 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -12,6 +12,7 @@ #include <linux/suspend.h> #include <linux/scatterlist.h> #include <linux/kdebug.h> +#include <linux/pgtable.h> #include <crypto/hash.h> @@ -19,7 +20,6 @@ #include <asm/init.h> #include <asm/proto.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/mtrr.h> #include <asm/sections.h> #include <asm/suspend.h> diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 262f83cad355..1ed1208931e0 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -3,9 +3,9 @@ #include <linux/slab.h> #include <linux/memblock.h> #include <linux/mem_encrypt.h> +#include <linux/pgtable.h> #include <asm/set_memory.h> -#include <asm/pgtable.h> #include <asm/realmode.h> #include <asm/tlbflush.h> #include <asm/crash.h> diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index 2a56cac64687..ff6bba2c8ab6 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -36,26 +36,6 @@ __wsum csum_partial_copy_nocheck(const void *src, void *dst, return csum_partial(dst, len, sum); } -/* - * the same as csum_partial, but copies from src while it - * checksums, and handles user-space pointer exceptions correctly, when needed. - * - * here even more important to align src and dst on a 32-bit (or even - * better 64-bit) boundary - */ - -static __inline__ -__wsum csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum sum, int *err_ptr) -{ - if (copy_from_user(dst, src, len)) { - *err_ptr = -EFAULT; - return (__force __wsum)-1; - } - - return csum_partial(dst, len, sum); -} - /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 9e7c4aba6c3a..76d9f6ce7a3d 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -58,7 +58,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso_enabled) return 0; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, @@ -66,7 +66,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdsop); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return err; } diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 1abe455d926a..205a9bc981b0 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -29,7 +29,7 @@ static efi_system_table_t efi_systab_xen __initdata = { .fw_vendor = EFI_INVALID_TABLE_ADDR, /* Initialized later. */ .fw_revision = 0, /* Initialized later. */ .con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ - .con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_in = NULL, /* Not used under Xen. */ .con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ .con_out = NULL, /* Not used under Xen. */ .stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. 
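The um checksum.h hunk above drops csum_partial_copy_from_user() but keeps csum_fold(), whose kernel-doc header is visible in the context lines. The x86 kernel folds with inline asm; this is a portable sketch of the same one's-complement arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit partial sum into 16 bits and invert it. */
static uint16_t csum_fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);   /* fold the high half in */
	sum = (sum & 0xffff) + (sum >> 16);   /* absorb a possible carry */
	return (uint16_t)~sum;
}

int main(void)
{
	/* 0x5678 + 0x1234 = 0x68ac; ~0x68ac = 0x9753 */
	printf("%#x\n", (unsigned int)csum_fold(0x12345678U));
	return 0;
}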
*/ diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index e138f7de52d2..3e89b0067ff0 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -13,6 +13,7 @@ #include <asm/smp.h> #include <asm/reboot.h> #include <asm/setup.h> +#include <asm/idtentry.h> #include <asm/hypervisor.h> #include <asm/e820/api.h> #include <asm/early_ioremap.h> @@ -118,6 +119,17 @@ static void __init init_hvm_pv_info(void) this_cpu_write(xen_vcpu_id, smp_processor_id()); } +DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + + xen_hvm_evtchn_do_upcall(); + + set_irq_regs(old_regs); +} + #ifdef CONFIG_KEXEC_CORE static void xen_hvm_shutdown(void) { diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 507f4fb88fa7..33b309d65955 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -63,7 +63,6 @@ #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> #include <asm/stackprotector.h> @@ -605,32 +604,42 @@ struct trap_array_entry { bool ist_okay; }; +#define TRAP_ENTRY(func, ist_ok) { \ + .orig = asm_##func, \ + .xen = xen_asm_##func, \ + .ist_okay = ist_ok } + +#define TRAP_ENTRY_REDIR(func, xenfunc, ist_ok) { \ + .orig = asm_##func, \ + .xen = xen_asm_##xenfunc, \ + .ist_okay = ist_ok } + static struct trap_array_entry trap_array[] = { - { debug, xen_xendebug, true }, - { double_fault, xen_double_fault, true }, + TRAP_ENTRY_REDIR(exc_debug, exc_xendebug, true ), + TRAP_ENTRY(exc_double_fault, true ), #ifdef CONFIG_X86_MCE - { machine_check, xen_machine_check, true }, + TRAP_ENTRY(exc_machine_check, true ), #endif - { nmi, xen_xennmi, true }, - { int3, xen_int3, false }, - { overflow, xen_overflow, false }, + TRAP_ENTRY_REDIR(exc_nmi, exc_xennmi, true ), + TRAP_ENTRY(exc_int3, false ), + TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION { entry_INT80_compat, xen_entry_INT80_compat, false }, #endif - { page_fault, xen_page_fault, false }, - { divide_error, xen_divide_error, false }, - { bounds, xen_bounds, false }, - { invalid_op, xen_invalid_op, false }, - { device_not_available, xen_device_not_available, false }, - { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, - { invalid_TSS, xen_invalid_TSS, false }, - { segment_not_present, xen_segment_not_present, false }, - { stack_segment, xen_stack_segment, false }, - { general_protection, xen_general_protection, false }, - { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, - { coprocessor_error, xen_coprocessor_error, false }, - { alignment_check, xen_alignment_check, false }, - { simd_coprocessor_error, xen_simd_coprocessor_error, false }, + TRAP_ENTRY(exc_page_fault, false ), + TRAP_ENTRY(exc_divide_error, false ), + TRAP_ENTRY(exc_bounds, false ), + TRAP_ENTRY(exc_invalid_op, false ), + TRAP_ENTRY(exc_device_not_available, false ), + TRAP_ENTRY(exc_coproc_segment_overrun, false ), + TRAP_ENTRY(exc_invalid_tss, false ), + TRAP_ENTRY(exc_segment_not_present, false ), + TRAP_ENTRY(exc_stack_segment, false ), + TRAP_ENTRY(exc_general_protection, false ), + TRAP_ENTRY(exc_spurious_interrupt_bug, false ), + TRAP_ENTRY(exc_coprocessor_error, false ), + TRAP_ENTRY(exc_alignment_check, false ), + TRAP_ENTRY(exc_simd_coprocessor_error, false ), }; static bool __ref get_trap_addr(void **addr, unsigned int ist) @@ -642,7 +651,7 @@ static bool __ref 
get_trap_addr(void **addr, unsigned int ist) * Replace trap handler addresses by Xen specific ones. * Check for known traps using IST and whitelist them. * The debugger ones are the only ones we care about. - * Xen will handle faults like double_fault, * so we should never see + * Xen will handle faults like double_fault, so we should never see * them. Warn if there's an unexpected IST-using fault handler. */ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index ecb0d5450334..4988e19598c8 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -21,7 +21,6 @@ #include <xen/grant_table.h> #include <xen/xen.h> -#include <asm/pgtable.h> static struct gnttab_vm_area { struct vm_struct *area; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index bbba8b17829a..a58d9c69807a 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -51,13 +51,13 @@ #include <linux/memblock.h> #include <linux/seq_file.h> #include <linux/crash_dump.h> +#include <linux/pgtable.h> #ifdef CONFIG_KEXEC_CORE #include <linux/kexec.h> #endif #include <trace/events/xen.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/fixmap.h> #include <asm/mmu_context.h> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1a2d8a50dac4..3566e37241d7 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -20,6 +20,7 @@ #include <asm/setup.h> #include <asm/acpi.h> #include <asm/numa.h> +#include <asm/idtentry.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -993,7 +994,8 @@ static void __init xen_pvmmu_arch_setup(void) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); - if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || + if (register_callback(CALLBACKTYPE_event, + xen_asm_exc_xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) BUG(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 8fb8a50a28b4..171aff1b11f2 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -23,10 +23,11 @@ #include <linux/nmi.h> #include <linux/cpuhotplug.h> #include <linux/stackprotector.h> +#include <linux/pgtable.h> #include <asm/paravirt.h> +#include <asm/idtentry.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <asm/cpu.h> #include <xen/interface/xen.h> @@ -93,6 +94,7 @@ asmlinkage __visible void cpu_bringup_and_idle(void) cpu_bringup(); boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + prevent_tail_call_optimization(); } void xen_smp_intr_free_pv(unsigned int cpu) @@ -347,7 +349,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; + (unsigned long)xen_asm_exc_xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index e666b614cf6d..9d548b0c772f 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -2,6 +2,7 @@ #include <linux/types.h> #include <xen/xen.h> +#include <xen/hvm.h> #include <xen/features.h> #include <xen/interface/features.h> @@ -13,6 +14,6 @@ void xen_hvm_post_suspend(int suspend_cancelled) xen_hvm_init_shared_info(); xen_vcpu_restore(); } - xen_callback_vector(); + xen_setup_callback_vector(); xen_unplug_emulated_devices(); } diff --git
a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 2712e9155306..4757cec33abe 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -93,7 +93,7 @@ xen_iret_start_crit: /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback. Otherwise do not + * jump back into exc_xen_hypervisor_callback. Otherwise do not * touch XEN_vcpu_info_mask. */ jne 1f @@ -113,11 +113,11 @@ iret_restore_end: * Events are masked, so jumping out of the critical region is * OK. */ - je xen_hypervisor_callback + je xen_asm_exc_xen_hypervisor_callback 1: iret xen_iret_end_crit: - _ASM_EXTABLE(1b, iret_exc) + _ASM_EXTABLE(1b, asm_iret_error) hyper_iret: /* put this out of line since its very rarely used */ @@ -127,7 +127,7 @@ SYM_CODE_END(xen_iret) .globl xen_iret_start_crit, xen_iret_end_crit /* - * This is called by xen_hypervisor_callback in entry_32.S when it sees + * This is called by xen_asm_exc_xen_hypervisor_callback in entry_32.S when it sees * that the EIP at the time of interrupt was between * xen_iret_start_crit and xen_iret_end_crit. * @@ -144,7 +144,7 @@ SYM_CODE_END(xen_iret) * eflags } * cs } nested exception info * eip } - * return address : (into xen_hypervisor_callback) + * return address : (into xen_asm_exc_xen_hypervisor_callback) * * In order to deliver the nested exception properly, we need to discard the * nested exception frame such that when we handle the exception, we do it @@ -152,7 +152,8 @@ SYM_CODE_END(xen_iret) * * The only caveat is that if the outer eax hasn't been restored yet (i.e. * it's still on stack), we need to restore its value here. - */ +*/ +.pushsection .noinstr.text, "ax" SYM_CODE_START(xen_iret_crit_fixup) /* * Paranoia: Make sure we're really coming from kernel space. 
@@ -181,3 +182,4 @@ SYM_CODE_START(xen_iret_crit_fixup) 2: ret SYM_CODE_END(xen_iret_crit_fixup) +.popsection diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 0a0fd168683a..5d252aaeade8 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -28,33 +28,33 @@ SYM_CODE_END(xen_\name) _ASM_NOKPROBE(xen_\name) .endm -xen_pv_trap divide_error -xen_pv_trap debug -xen_pv_trap xendebug -xen_pv_trap int3 -xen_pv_trap xennmi -xen_pv_trap overflow -xen_pv_trap bounds -xen_pv_trap invalid_op -xen_pv_trap device_not_available -xen_pv_trap double_fault -xen_pv_trap coprocessor_segment_overrun -xen_pv_trap invalid_TSS -xen_pv_trap segment_not_present -xen_pv_trap stack_segment -xen_pv_trap general_protection -xen_pv_trap page_fault -xen_pv_trap spurious_interrupt_bug -xen_pv_trap coprocessor_error -xen_pv_trap alignment_check +xen_pv_trap asm_exc_divide_error +xen_pv_trap asm_exc_debug +xen_pv_trap asm_exc_xendebug +xen_pv_trap asm_exc_int3 +xen_pv_trap asm_exc_xennmi +xen_pv_trap asm_exc_overflow +xen_pv_trap asm_exc_bounds +xen_pv_trap asm_exc_invalid_op +xen_pv_trap asm_exc_device_not_available +xen_pv_trap asm_exc_double_fault +xen_pv_trap asm_exc_coproc_segment_overrun +xen_pv_trap asm_exc_invalid_tss +xen_pv_trap asm_exc_segment_not_present +xen_pv_trap asm_exc_stack_segment +xen_pv_trap asm_exc_general_protection +xen_pv_trap asm_exc_page_fault +xen_pv_trap asm_exc_spurious_interrupt_bug +xen_pv_trap asm_exc_coprocessor_error +xen_pv_trap asm_exc_alignment_check #ifdef CONFIG_X86_MCE -xen_pv_trap machine_check +xen_pv_trap asm_exc_machine_check #endif /* CONFIG_X86_MCE */ -xen_pv_trap simd_coprocessor_error +xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION xen_pv_trap entry_INT80_compat #endif -xen_pv_trap hypervisor_callback +xen_pv_trap asm_exc_xen_hypervisor_callback __INIT SYM_CODE_START(xen_early_idt_handler_array) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 45a441c33d6d..53b224fd6177 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -8,7 +8,6 @@ #include <xen/xen-ops.h> /* These are code, but not functions. Defined in entry.S */ -extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; void xen_sysenter_target(void); @@ -55,7 +54,6 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); -void xen_callback_vector(void); void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); |
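The TRAP_ENTRY()/TRAP_ENTRY_REDIR() macros in the enlighten_pv.c hunk and the renamed xen_pv_trap asm_exc_* list above both lean on the systematic asm_exc_*/xen_asm_exc_* symbol naming, so a single name can generate the whole native/Xen pair. A standalone model of the token pasting, with printf stubs invented for the demo:

#include <stdio.h>

struct trap_array_entry {
	void (*orig)(void);
	void (*xen)(void);
	int ist_okay;
};

/* Derive both handler symbols from one name via ## pasting. */
#define DEFINE_STUBS(func)                                          \
	static void asm_##func(void)     { puts("asm_" #func); }    \
	static void xen_asm_##func(void) { puts("xen_asm_" #func); }

#define TRAP_ENTRY(func, ist_ok) {    \
	.orig     = asm_##func,       \
	.xen      = xen_asm_##func,   \
	.ist_okay = ist_ok }

DEFINE_STUBS(exc_int3)
DEFINE_STUBS(exc_overflow)

static struct trap_array_entry trap_array[] = {
	TRAP_ENTRY(exc_int3,     0),
	TRAP_ENTRY(exc_overflow, 0),
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(trap_array) / sizeof(trap_array[0]); i++)
		trap_array[i].xen();   /* pick the Xen-side handler, as get_trap_addr() does */
	return 0;
}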