diff options
Diffstat (limited to 'arch/x86')
386 files changed, 11126 insertions, 9810 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b9f378e05f6..121f9f03bd5c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -153,6 +153,7 @@ config X86 select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH + select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_WATCHDOG @@ -426,8 +427,7 @@ config DYNAMIC_PHYSICAL_MASK config PGTABLE_LEVELS int - default 5 if X86_5LEVEL - default 4 if X86_64 + default 5 if X86_64 default 3 if X86_PAE default 2 @@ -799,6 +799,7 @@ config PARAVIRT config PARAVIRT_XXL bool + depends on X86_64 config PARAVIRT_DEBUG bool "paravirt-ops debugging" @@ -1463,27 +1464,6 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. -config X86_5LEVEL - bool "Enable 5-level page tables support" - default y - select DYNAMIC_MEMORY_LAYOUT - select SPARSEMEM_VMEMMAP - depends on X86_64 - help - 5-level paging enables access to larger address space: - up to 128 PiB of virtual address space and 4 PiB of - physical address space. - - It will be supported by future Intel CPUs. - - A kernel with the option enabled can be booted on machines that - support 4- or 5-level paging. - - See Documentation/arch/x86/x86_64/5level-paging.rst for more - information. - - Say N if unsure. - config X86_DIRECT_GBPAGES def_bool y depends on X86_64 @@ -1579,6 +1559,7 @@ config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 + select SPARSEMEM_VMEMMAP if X86_64 config ARCH_SPARSEMEM_DEFAULT def_bool X86_64 || (NUMA && X86_32) @@ -2167,17 +2148,10 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. -config DYNAMIC_MEMORY_LAYOUT - bool - help - This option makes base addresses of vmalloc and vmemmap as well as - __PAGE_OFFSET movable during boot. - config RANDOMIZE_MEMORY bool "Randomize the kernel memory sections" depends on X86_64 depends on RANDOMIZE_BASE - select DYNAMIC_MEMORY_LAYOUT default RANDOMIZE_BASE help Randomizes the base virtual address of kernel memory sections @@ -2368,6 +2342,7 @@ config STRICT_SIGALTSTACK_SIZE config CFI_AUTO_DEFAULT bool "Attempt to use FineIBT by default at boot time" depends on FINEIBT + depends on !RUST || RUSTC_VERSION >= 108800 default y help Attempt to use FineIBT by default at boot time. If enabled, @@ -2710,6 +2685,18 @@ config MITIGATION_SSB of speculative execution in a similar way to the Meltdown and Spectre security vulnerabilities. +config MITIGATION_ITS + bool "Enable Indirect Target Selection mitigation" + depends on CPU_SUP_INTEL && X86_64 + depends on MITIGATION_RETPOLINE && MITIGATION_RETHUNK + select EXECMEM + default y + help + Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in + BPU on some Intel CPUs that may allow Spectre V2 style attacks. If + disabled, mitigation cannot be enabled via cmdline. + See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst> + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index 6d20a6ce0507..c827f694fb72 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -6,20 +6,6 @@ config AS_AVX512 help Supported by binutils >= 2.25 and LLVM integrated assembler -config AS_SHA1_NI - def_bool $(as-instr,sha1msg1 %xmm0$(comma)%xmm1) - help - Supported by binutils >= 2.24 and LLVM integrated assembler - -config AS_SHA256_NI - def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1) - help - Supported by binutils >= 2.24 and LLVM integrated assembler -config AS_TPAUSE - def_bool $(as-instr,tpause %ecx) - help - Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7 - config AS_GFNI def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2) help diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 753b8763abae..f928cf6e3252 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -245,6 +245,30 @@ config MATOM endchoice +config CC_HAS_MARCH_NATIVE + # This flag might not be available in cross-compilers: + def_bool $(cc-option, -march=native) + # LLVM 18 has an easily triggered internal compiler error in core + # networking code with '-march=native' on certain systems: + # https://github.com/llvm/llvm-project/issues/72026 + # LLVM 19 introduces an optimization that resolves some high stack + # usage warnings that only appear wth '-march=native'. + depends on CC_IS_GCC || CLANG_VERSION >= 190100 + +config X86_NATIVE_CPU + bool "Build and optimize for local/native CPU" + depends on X86_64 + depends on CC_HAS_MARCH_NATIVE + help + Optimize for the current CPU used to compile the kernel. + Use this option if you intend to build the kernel for your + local machine. + + Note that such a kernel might not work optimally on a + different x86 machine. + + If unsure, say N. + config X86_GENERIC bool "Generic x86 support" depends on X86_32 diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures index e12d5b7e39a2..250c10627ab3 100644 --- a/arch/x86/Kconfig.cpufeatures +++ b/arch/x86/Kconfig.cpufeatures @@ -132,10 +132,6 @@ config X86_DISABLED_FEATURE_OSPKE def_bool y depends on !X86_INTEL_MEMORY_PROTECTION_KEYS -config X86_DISABLED_FEATURE_LA57 - def_bool y - depends on !X86_5LEVEL - config X86_DISABLED_FEATURE_PTI def_bool y depends on !MITIGATION_PAGE_TABLE_ISOLATION diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 594723005d95..1913d342969b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -173,8 +173,13 @@ else # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) +ifdef CONFIG_X86_NATIVE_CPU + KBUILD_CFLAGS += -march=native + KBUILD_RUSTFLAGS += -Ctarget-cpu=native +else KBUILD_CFLAGS += -march=x86-64 -mtune=generic KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic +endif KBUILD_CFLAGS += -mno-red-zone KBUILD_CFLAGS += -mcmodel=kernel @@ -281,6 +286,7 @@ archprepare: $(cpufeaturemasks.hdr) ### # Kernel objects +core-y += arch/x86/boot/startup/ libs-y += arch/x86/lib/ # drivers-y are linked after core-y diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 81f55da81967..640fcac3af74 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -59,7 +59,7 @@ KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH) $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ - cmd_image = cp $< $@; truncate -s %4K $@; cat $(obj)/vmlinux.bin >>$@ + cmd_image = (dd if=$< bs=4k conv=sync status=none; cat $(filter-out $<,$(real-prereqs))) >$@ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE $(call if_changed,image) diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index aa9b96457584..cf4a6155714e 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -32,7 +32,7 @@ intcall: movw %dx, %si movw %sp, %di movw $11, %cx - rep; movsl + rep movsl /* Pop full state from the stack */ popal @@ -67,7 +67,7 @@ intcall: jz 4f movw %sp, %si movw $11, %cx - rep; movsl + rep movsl 4: addw $44, %sp /* Restore state and return */ diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 38f17a1e1e36..60580836daf7 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -34,7 +34,7 @@ extern struct setup_header hdr; extern struct boot_params boot_params; -#define cpu_relax() asm volatile("rep; nop") +#define cpu_relax() asm volatile("pause") static inline void io_delay(void) { @@ -155,14 +155,14 @@ static inline void wrgs32(u32 v, addr_t addr) static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("fs; repe; cmpsb" CC_SET(nz) + asm volatile("fs repe cmpsb" CC_SET(nz) : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("gs; repe; cmpsb" CC_SET(nz) + asm volatile("gs repe cmpsb" CC_SET(nz) : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fdbce022db55..f4f7b22d8113 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -44,10 +44,10 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h -# sev.c indirectly includes inat-table.h which is generated during +# sev-decode-insn.c indirectly includes inat-table.c which is generated during # compilation and stored in $(objtree). Add the directory to the includes so # that the compiler finds it even with out-of-tree builds (make O=/some/path). -CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/ +CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ @@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ @@ -96,8 +96,7 @@ ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o - vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o - vmlinux-objs-y += $(obj)/la57toggle.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o @@ -106,6 +105,7 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a $(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index eafd4f185e77..d9dab940ff62 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -35,7 +35,6 @@ #include <asm/bootparam.h> #include <asm/desc_defs.h> #include <asm/trapnr.h> -#include "pgtable.h" /* * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c index dbba332e4a12..0e9f84ab4bdc 100644 --- a/arch/x86/boot/compressed/mem.c +++ b/arch/x86/boot/compressed/mem.c @@ -38,7 +38,7 @@ void arch_accept_memory(phys_addr_t start, phys_addr_t end) if (early_is_tdx_guest()) { if (!tdx_accept_memory(start, end)) panic("TDX: Failed to accept memory\n"); - } else if (sev_snp_enabled()) { + } else if (early_is_sevsnp_guest()) { snp_accept_memory(start, end); } else { error("Cannot accept memory: unknown platform\n"); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1cdcd4aaf395..94b5991da001 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,7 +14,6 @@ #include "misc.h" #include "error.h" -#include "pgtable.h" #include "../string.h" #include "../voffset.h" #include <asm/bootparam_utils.h> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index dd8d1a85f671..db1048621ea2 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -136,6 +136,9 @@ static inline void console_init(void) #endif #ifdef CONFIG_AMD_MEM_ENCRYPT +struct es_em_ctxt; +struct insn; + void sev_enable(struct boot_params *bp); void snp_check_features(void); void sev_es_shutdown_ghcb(void); @@ -143,6 +146,11 @@ extern bool sev_es_check_ghcb_fault(unsigned long address); void snp_set_page_private(unsigned long paddr); void snp_set_page_shared(unsigned long paddr); void sev_prep_identity_maps(unsigned long top_level_pgt); + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt); +bool insn_has_rep_prefix(struct insn *insn); +void sev_insn_decode_init(void); +bool early_setup_ghcb(void); #else static inline void sev_enable(struct boot_params *bp) { diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h deleted file mode 100644 index 6d595abe06b3..000000000000 --- a/arch/x86/boot/compressed/pgtable.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef BOOT_COMPRESSED_PAGETABLE_H -#define BOOT_COMPRESSED_PAGETABLE_H - -#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) - -#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 - -#ifndef __ASSEMBLER__ - -extern unsigned long *trampoline_32bit; - -extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl); - -extern const u16 trampoline_ljmp_imm_offset; - -#endif /* __ASSEMBLER__ */ -#endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index d8c5de40669d..bdd26050dff7 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -4,19 +4,16 @@ #include <asm/bootparam_utils.h> #include <asm/e820/types.h> #include <asm/processor.h> -#include "pgtable.h" #include "../string.h" #include "efi.h" #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ -#ifdef CONFIG_X86_5LEVEL /* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */ unsigned int __section(".data") __pgtable_l5_enabled; unsigned int __section(".data") pgdir_shift = 39; unsigned int __section(".data") ptrs_per_p4d = 1; -#endif /* Buffer to preserve trampoline memory */ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; @@ -115,18 +112,13 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) * Check if LA57 is desired and supported. * * There are several parts to the check: - * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y * - if user asked to disable 5-level paging: no5lvl in cmdline * - if the machine supports 5-level paging: * + CPUID leaf 7 is supported * + the leaf has the feature bit set - * - * That's substitute for boot_cpu_has() in early boot code. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL) && - !cmdline_find_option_bool("no5lvl") && - native_cpuid_eax(0) >= 7 && - (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { + if (!cmdline_find_option_bool("no5lvl") && + native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) { l5_required = true; /* Initialize variables for 5-level paging */ diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c new file mode 100644 index 000000000000..89dd02de2a0f --- /dev/null +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "sev.h" + +#include <linux/kernel.h> +#include <linux/string.h> +#include <asm/insn.h> +#include <asm/pgtable_types.h> +#include <asm/ptrace.h> +#include <asm/sev.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/fpu/xcr.h> + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +bool insn_has_rep_prefix(struct insn *insn) +{ + insn_byte_t p; + int i; + + insn_get_prefixes(insn); + + for_each_insn_prefix(insn, i, p) { + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + + return ES_OK; +} + +extern void sev_insn_decode_init(void) __alias(inat_init_tables); + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + +#define sev_printk(fmt, ...) + +#include "../../coco/sev/vc-shared.c" + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_ghcb()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) + vc_finish_insn(&ctxt); + else if (result != ES_RETRY) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index bb55934c1cee..fd1b67dfea22 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -21,99 +21,14 @@ #include <asm/fpu/xcr.h> #include <asm/ptrace.h> #include <asm/svm.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include "error.h" -#include "../msr.h" +#include "sev.h" static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); struct ghcb *boot_ghcb; -/* - * Copy a version of this function here - insn-eval.c can't be used in - * pre-decompression code. - */ -static bool insn_has_rep_prefix(struct insn *insn) -{ - insn_byte_t p; - int i; - - insn_get_prefixes(insn); - - for_each_insn_prefix(insn, i, p) { - if (p == 0xf2 || p == 0xf3) - return true; - } - - return false; -} - -/* - * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and - * doesn't use segments. - */ -static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) -{ - return 0UL; -} - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - struct msr m; - - boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); - - return m.q; -} - -static inline void sev_es_wr_ghcb_msr(u64 val) -{ - struct msr m; - - m.q = val; - boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int ret; - - memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - - return ES_OK; -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - void *dst, char *buf, size_t size) -{ - memcpy(dst, buf, size); - - return ES_OK; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - void *src, char *buf, size_t size) -{ - memcpy(buf, src, size); - - return ES_OK; -} - -static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) -{ - return ES_OK; -} - -static bool fault_in_kernel_space(unsigned long address) -{ - return false; -} - #undef __init #define __init @@ -122,24 +37,27 @@ static bool fault_in_kernel_space(unsigned long address) #define __BOOT_COMPRESSED -/* Basic instruction decoding support needed */ -#include "../../lib/inat.c" -#include "../../lib/insn.c" - -/* Include code for early handlers */ -#include "../../coco/sev/shared.c" +extern struct svsm_ca *boot_svsm_caa; +extern u64 boot_svsm_caa_pa; -static struct svsm_ca *svsm_get_caa(void) +struct svsm_ca *svsm_get_caa(void) { return boot_svsm_caa; } -static u64 svsm_get_caa_pa(void) +u64 svsm_get_caa_pa(void) { return boot_svsm_caa_pa; } -static int svsm_perform_call_protocol(struct svsm_call *call) +int svsm_perform_call_protocol(struct svsm_call *call); + +u8 snp_vmpl; + +/* Include code for early handlers */ +#include "../../boot/startup/sev-shared.c" + +int svsm_perform_call_protocol(struct svsm_call *call) { struct ghcb *ghcb; int ret; @@ -157,17 +75,14 @@ static int svsm_perform_call_protocol(struct svsm_call *call) return ret; } -bool sev_snp_enabled(void) +static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } static void __page_state_change(unsigned long paddr, enum psc_op op) { - u64 val; - - if (!sev_snp_enabled()) - return; + u64 val, msr; /* * If private -> shared then invalidate the page before requesting the @@ -176,6 +91,9 @@ static void __page_state_change(unsigned long paddr, enum psc_op op) if (op == SNP_PAGE_STATE_SHARED) pvalidate_4k_page(paddr, paddr, false); + /* Save the current GHCB MSR value */ + msr = sev_es_rd_ghcb_msr(); + /* Issue VMGEXIT to change the page state in RMP table. */ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); VMGEXIT(); @@ -185,6 +103,9 @@ static void __page_state_change(unsigned long paddr, enum psc_op op) if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + /* Restore the GHCB MSR value */ + sev_es_wr_ghcb_msr(msr); + /* * Now that page state is changed in the RMP table, validate it so that it is * consistent with the RMP entry. @@ -195,15 +116,21 @@ static void __page_state_change(unsigned long paddr, enum psc_op op) void snp_set_page_private(unsigned long paddr) { + if (!sev_snp_enabled()) + return; + __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); } void snp_set_page_shared(unsigned long paddr) { + if (!sev_snp_enabled()) + return; + __page_state_change(paddr, SNP_PAGE_STATE_SHARED); } -static bool early_setup_ghcb(void) +bool early_setup_ghcb(void) { if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -214,7 +141,7 @@ static bool early_setup_ghcb(void) boot_ghcb = &boot_ghcb_page; /* Initialize lookup tables for the instruction decoder */ - inat_init_tables(); + sev_insn_decode_init(); /* SNP guest requires the GHCB GPA must be registered */ if (sev_snp_enabled()) @@ -223,56 +150,10 @@ static bool early_setup_ghcb(void) return true; } -static phys_addr_t __snp_accept_memory(struct snp_psc_desc *desc, - phys_addr_t pa, phys_addr_t pa_end) -{ - struct psc_hdr *hdr; - struct psc_entry *e; - unsigned int i; - - hdr = &desc->hdr; - memset(hdr, 0, sizeof(*hdr)); - - e = desc->entries; - - i = 0; - while (pa < pa_end && i < VMGEXIT_PSC_MAX_ENTRY) { - hdr->end_entry = i; - - e->gfn = pa >> PAGE_SHIFT; - e->operation = SNP_PAGE_STATE_PRIVATE; - if (IS_ALIGNED(pa, PMD_SIZE) && (pa_end - pa) >= PMD_SIZE) { - e->pagesize = RMP_PG_SIZE_2M; - pa += PMD_SIZE; - } else { - e->pagesize = RMP_PG_SIZE_4K; - pa += PAGE_SIZE; - } - - e++; - i++; - } - - if (vmgexit_psc(boot_ghcb, desc)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - pvalidate_pages(desc); - - return pa; -} - void snp_accept_memory(phys_addr_t start, phys_addr_t end) { - struct snp_psc_desc desc = {}; - unsigned int i; - phys_addr_t pa; - - if (!boot_ghcb && !early_setup_ghcb()) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - pa = start; - while (pa < end) - pa = __snp_accept_memory(&desc, pa, end); + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) + __page_state_change(pa, SNP_PAGE_STATE_PRIVATE); } void sev_es_shutdown_ghcb(void) @@ -333,46 +214,6 @@ bool sev_es_check_ghcb_fault(unsigned long address) return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); } -void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) -{ - struct es_em_ctxt ctxt; - enum es_result result; - - if (!boot_ghcb && !early_setup_ghcb()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - vc_ghcb_invalidate(boot_ghcb); - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result != ES_OK) - goto finish; - - result = vc_check_opcode_bytes(&ctxt, exit_code); - if (result != ES_OK) - goto finish; - - switch (exit_code) { - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(boot_ghcb, &ctxt); - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(boot_ghcb, &ctxt); - break; - default: - result = ES_UNSUPPORTED; - break; - } - -finish: - if (result == ES_OK) - vc_finish_insn(&ctxt); - else if (result != ES_RETRY) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - /* * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need * guest side implementation for proper functioning of the guest. If any @@ -682,3 +523,43 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) sev_verify_cbit(top_level_pgt); } + +bool early_is_sevsnp_guest(void) +{ + static bool sevsnp; + + if (sevsnp) + return true; + + if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) + return false; + + sevsnp = true; + + if (!snp_vmpl) { + unsigned int eax, ebx, ecx, edx; + + /* + * CPUID Fn8000_001F_EAX[28] - SVSM support + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax & BIT(28)) { + struct msr m; + + /* Obtain the address of the calling area to use */ + boot_rdmsr(MSR_SVSM_CAA, &m); + boot_svsm_caa = (void *)m.q; + boot_svsm_caa_pa = m.q; + + /* + * The real VMPL level cannot be discovered, but the + * memory acceptance routines make no use of that so + * any non-zero value suffices here. + */ + snp_vmpl = U8_MAX; + } + } + return true; +} diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h index fc725a981b09..92f79c21939c 100644 --- a/arch/x86/boot/compressed/sev.h +++ b/arch/x86/boot/compressed/sev.h @@ -10,13 +10,34 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT -bool sev_snp_enabled(void); +#include "../msr.h" + void snp_accept_memory(phys_addr_t start, phys_addr_t end); +u64 sev_get_status(void); +bool early_is_sevsnp_guest(void); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + struct msr m; + + boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); + + return m.q; +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + struct msr m; + + m.q = val; + boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); +} #else -static inline bool sev_snp_enabled(void) { return false; } static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } +static inline u64 sev_get_status(void) { return 0; } +static inline bool early_is_sevsnp_guest(void) { return false; } #endif diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 81fc1eaa3229..9af19d9614cb 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -15,9 +15,9 @@ static void *____memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( - "rep ; movsl\n\t" + "rep movsl\n\t" "movl %4,%%ecx\n\t" - "rep ; movsb\n\t" + "rep movsb" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) : "memory"); @@ -29,9 +29,9 @@ static void *____memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( - "rep ; movsq\n\t" + "rep movsq\n\t" "movq %4,%%rcx\n\t" - "rep ; movsb\n\t" + "rep movsb" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) : "memory"); diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 6afd05e819d2..3973a67cd04e 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -22,10 +22,10 @@ SYM_FUNC_START_NOALIGN(memcpy) movw %dx, %si pushw %cx shrw $2, %cx - rep; movsl + rep movsl popw %cx andw $3, %cx - rep; movsb + rep movsb popw %di popw %si retl @@ -38,10 +38,10 @@ SYM_FUNC_START_NOALIGN(memset) imull $0x01010101,%eax pushw %cx shrw $2, %cx - rep; stosl + rep stosl popw %cx andw $3, %cx - rep; stosb + rep stosb popw %di retl SYM_FUNC_END(memset) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b5c79f43359b..e30649e44d8f 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -361,12 +361,8 @@ xloadflags: #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_X86_5LEVEL #define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED) #else -#define XLF56 XLF_5LEVEL -#endif -#else #define XLF56 0 #endif @@ -585,7 +581,7 @@ start_of_setup: xorl %eax, %eax subw %di, %cx shrw $2, %cx - rep; stosl + rep stosl # Jump to C code (should not return) calll main diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile new file mode 100644 index 000000000000..b514f7e81332 --- /dev/null +++ b/arch/x86/boot/startup/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 + +KBUILD_AFLAGS += -D__DISABLE_EXPORTS +KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ + -Os -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) \ + -fno-stack-protector -D__NO_FORTIFY \ + -fno-jump-tables \ + -include $(srctree)/include/linux/hidden.h + +# disable ftrace hooks and LTO +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS)) +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +KMSAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o + +lib-$(CONFIG_X86_64) += la57toggle.o +lib-$(CONFIG_EFI_MIXED) += efi-mixed.o + +# +# Disable objtool validation for all library code, which is intended +# to be linked into the decompressor or the EFI stub but not vmlinux +# +$(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y diff --git a/arch/x86/boot/startup/efi-mixed.S b/arch/x86/boot/startup/efi-mixed.S new file mode 100644 index 000000000000..e04ed99bc449 --- /dev/null +++ b/arch/x86/boot/startup/efi-mixed.S @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT and IDT before we make EFI service + * calls. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identity + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. + */ + +#include <linux/linkage.h> +#include <asm/desc_defs.h> +#include <asm/msr.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + + .text + .code32 +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL +SYM_FUNC_START(efi32_stub_entry) + call 1f +1: popl %ecx + + /* Clear BSS */ + xorl %eax, %eax + leal (_bss - 1b)(%ecx), %edi + leal (_ebss - 1b)(%ecx), %ecx + subl %edi, %ecx + shrl $2, %ecx + cld + rep stosl + + add $0x4, %esp /* Discard return address */ + movl 8(%esp), %ebx /* struct boot_params pointer */ + jmp efi32_startup +SYM_FUNC_END(efi32_stub_entry) +#endif + +/* + * Called using a far call from __efi64_thunk() below, using the x86_64 SysV + * ABI (except for R8/R9 which are inaccessible to 32-bit code - EAX/EBX are + * used instead). EBP+16 points to the arguments passed via the stack. + * + * The first argument (EDI) is a pointer to the boot service or protocol, to + * which the remaining arguments are passed, each truncated to 32 bits. + */ +SYM_FUNC_START_LOCAL(efi_enter32) + /* + * Convert x86-64 SysV ABI params to i386 ABI + */ + pushl 32(%ebp) /* Up to 3 args passed via the stack */ + pushl 24(%ebp) + pushl 16(%ebp) + pushl %ebx /* R9 */ + pushl %eax /* R8 */ + pushl %ecx + pushl %edx + pushl %esi + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + call efi32_enable_long_mode + + addl $32, %esp + movl %edi, %eax + lret +SYM_FUNC_END(efi_enter32) + + .code64 +SYM_FUNC_START(__efi64_thunk) + push %rbp + movl %esp, %ebp + push %rbx + + /* Move args #5 and #6 into 32-bit accessible registers */ + movl %r8d, %eax + movl %r9d, %ebx + + lcalll *efi32_call(%rip) + + pop %rbx + pop %rbp + RET +SYM_FUNC_END(__efi64_thunk) + + .code32 +SYM_FUNC_START_LOCAL(efi32_enable_long_mode) + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Disable interrupts - the firmware's IDT does not work in long mode */ + cli + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + ret +SYM_FUNC_END(efi32_enable_long_mode) + +/* + * This is the common EFI stub entry point for mixed mode. It sets up the GDT + * and page tables needed for 64-bit execution, after which it calls the + * common 64-bit EFI entrypoint efi_stub_entry(). + * + * Arguments: 0(%esp) image handle + * 4(%esp) EFI system table pointer + * %ebx struct boot_params pointer (or NULL) + * + * Since this is the point of no return for ordinary execution, no registers + * are considered live except for the function parameters. [Note that the EFI + * stub may still exit and return to the firmware using the Exit() EFI boot + * service.] + */ +SYM_FUNC_START_LOCAL(efi32_startup) + movl %esp, %ebp + + subl $8, %esp + sgdtl (%esp) /* Save GDT descriptor to the stack */ + movl 2(%esp), %esi /* Existing GDT pointer */ + movzwl (%esp), %ecx /* Existing GDT limit */ + inc %ecx /* Existing GDT size */ + andl $~7, %ecx /* Ensure size is multiple of 8 */ + + subl %ecx, %esp /* Allocate new GDT */ + andl $~15, %esp /* Realign the stack */ + movl %esp, %edi /* New GDT address */ + leal 7(%ecx), %eax /* New GDT limit */ + pushw %cx /* Push 64-bit CS (for LJMP below) */ + pushl %edi /* Push new GDT address */ + pushw %ax /* Push new GDT limit */ + + /* Copy GDT to the stack and add a 64-bit code segment at the end */ + movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) & 0xffffffff, (%edi,%ecx) + movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) >> 32, 4(%edi,%ecx) + shrl $2, %ecx + cld + rep movsl /* Copy the firmware GDT */ + lgdtl (%esp) /* Switch to the new GDT */ + + call 1f +1: pop %edi + + /* Record mixed mode entry */ + movb $0x0, (efi_is64 - 1b)(%edi) + + /* Set up indirect far call to re-enter 32-bit mode */ + leal (efi32_call - 1b)(%edi), %eax + addl %eax, (%eax) + movw %cs, 4(%eax) + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Set up 1:1 mapping */ + leal (pte - 1b)(%edi), %eax + movl $_PAGE_PRESENT | _PAGE_RW | _PAGE_PSE, %ecx + leal (_PAGE_PRESENT | _PAGE_RW)(%eax), %edx +2: movl %ecx, (%eax) + addl $8, %eax + addl $PMD_SIZE, %ecx + jnc 2b + + movl $PAGE_SIZE, %ecx + .irpc l, 0123 + movl %edx, \l * 8(%eax) + addl %ecx, %edx + .endr + addl %ecx, %eax + movl %edx, (%eax) + movl %eax, %cr3 + + call efi32_enable_long_mode + + /* Set up far jump to 64-bit mode (CS is already on the stack) */ + leal (efi_stub_entry - 1b)(%edi), %eax + movl %eax, 2(%esp) + + movl 0(%ebp), %edi + movl 4(%ebp), %esi + movl %ebx, %edx + ljmpl *2(%esp) +SYM_FUNC_END(efi32_startup) + +/* + * efi_status_t efi32_pe_entry(efi_handle_t image_handle, + * efi_system_table_32_t *sys_table) + */ +SYM_FUNC_START(efi32_pe_entry) + pushl %ebx // save callee-save registers + + /* Check whether the CPU supports long mode */ + movl $0x80000001, %eax // assume extended info support + cpuid + btl $29, %edx // check long mode bit + jnc 1f + leal 8(%esp), %esp // preserve stack alignment + xor %ebx, %ebx // no struct boot_params pointer + jmp efi32_startup // only ESP and EBX remain live +1: movl $0x80000003, %eax // EFI_UNSUPPORTED + popl %ebx + RET +SYM_FUNC_END(efi32_pe_entry) + +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL + .org efi32_stub_entry + 0x200 + .code64 +SYM_FUNC_START_NOALIGN(efi64_stub_entry) + jmp efi_handover_entry +SYM_FUNC_END(efi64_stub_entry) +#endif + + .data + .balign 8 +SYM_DATA_START_LOCAL(efi32_call) + .long efi_enter32 - . + .word 0x0 +SYM_DATA_END(efi32_call) +SYM_DATA(efi_is64, .byte 1) + + .bss + .balign PAGE_SIZE +SYM_DATA_LOCAL(pte, .fill 6 * PAGE_SIZE, 1, 0) diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c new file mode 100644 index 000000000000..a3112a69b06a --- /dev/null +++ b/arch/x86/boot/startup/gdt_idt.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <linux/types.h> + +#include <asm/desc.h> +#include <asm/init.h> +#include <asm/setup.h> +#include <asm/sev.h> +#include <asm/trapnr.h> + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +/* This may run while still in the direct mapping */ +void __head startup_64_load_idt(void *vc_handler) +{ + struct desc_ptr desc = { + .address = (unsigned long)rip_rel_ptr(bringup_idt_table), + .size = sizeof(bringup_idt_table) - 1, + }; + struct idt_data data; + gate_desc idt_desc; + + /* @vc_handler is set only for a VMM Communication Exception */ + if (vc_handler) { + init_idt_data(&data, X86_TRAP_VC, vc_handler); + idt_init_desc(&idt_desc, &data); + native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); + } + + native_load_idt(&desc); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. + */ +void __head startup_64_setup_gdt_idt(void) +{ + struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); + void *handler = NULL; + + struct desc_ptr startup_gdt_descr = { + .address = (unsigned long)gp->gdt, + .size = GDT_SIZE - 1, + }; + + /* Load GDT */ + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + handler = rip_rel_ptr(vc_no_ghcb); + + startup_64_load_idt(handler); +} diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/startup/la57toggle.S index 9ee002387eb1..370075b4d95b 100644 --- a/arch/x86/boot/compressed/la57toggle.S +++ b/arch/x86/boot/startup/la57toggle.S @@ -5,7 +5,6 @@ #include <asm/boot.h> #include <asm/msr.h> #include <asm/processor-flags.h> -#include "pgtable.h" /* * This is the 32-bit trampoline that will be copied over to low memory. It diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c new file mode 100644 index 000000000000..332dbe6688c4 --- /dev/null +++ b/arch/x86/boot/startup/map_kernel.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pgtable.h> + +#include <asm/init.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/sev.h> + +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +extern unsigned int next_early_pgt; + +static inline bool check_la57_support(void) +{ + /* + * 5-level paging is detected and enabled at kernel decompression + * stage. Only check if it has been enabled there. + */ + if (!(native_read_cr4() & X86_CR4_LA57)) + return false; + + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + + return true; +} + +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, + pmdval_t *pmd, + unsigned long p2v_offset) +{ + unsigned long paddr, paddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted); + paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted); + + for (; paddr < paddr_end; paddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use the PA to get a *currently* valid virtual address. + */ + early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); + + i = pmd_index(paddr - p2v_offset); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +/* + * This code is compiled using PIC codegen because it will execute from the + * early 1:1 mapping of memory, which deviates from the mapping expected by the + * linker. Due to this deviation, taking the address of a global variable will + * produce an ambiguous result when using the plain & operator. Instead, + * rip_rel_ptr() must be used, which will return the RIP-relative address in + * the 1:1 mapping of memory. Kernel virtual addresses can be determined by + * subtracting p2v_offset from the RIP-relative address. + */ +unsigned long __head __startup_64(unsigned long p2v_offset, + struct boot_params *bp) +{ + pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); + unsigned long physaddr = (unsigned long)rip_rel_ptr(_text); + unsigned long va_text, va_end; + unsigned long pgtable_flags; + unsigned long load_delta; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + bool la57; + int i; + + la57 = check_la57_support(); + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + phys_base = load_delta = __START_KERNEL_map + p2v_offset; + + /* Is the address not 2M aligned? */ + if (load_delta & ~PMD_MASK) + for (;;); + + va_text = physaddr - p2v_offset; + va_end = (unsigned long)rip_rel_ptr(_end) - p2v_offset; + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + + /* Fixup the physical addresses in the page table */ + + pgd = rip_rel_ptr(early_top_pgt); + pgd[pgd_index(__START_KERNEL_map)] += load_delta; + + if (la57) { + p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt); + p4d[MAX_PTRS_PER_P4D - 1] += load_delta; + + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; + } + + level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta; + level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta; + + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + level2_fixmap_pgt[i].pmd += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ + + pud = &early_pgts[0]->pmd; + pmd = &early_pgts[1]->pmd; + next_early_pgt = 2; + + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + + if (la57) { + p4d = &early_pgts[next_early_pgt++]->pmd; + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; + + i = physaddr >> P4D_SHIFT; + p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; + } + + i = physaddr >> PUD_SHIFT; + pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT); + + pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. + */ + + pmd = rip_rel_ptr(level2_kernel_pgt); + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index(va_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index(va_end); i++) + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; + + return sme_postprocess_startup(bp, pmd, p2v_offset); +} diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/boot/startup/sev-shared.c index 2e4122f8aa6b..7a706db87b93 100644 --- a/arch/x86/coco/sev/shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -14,76 +14,23 @@ #ifndef __BOOT_COMPRESSED #define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) -#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) -#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) #else #undef WARN #define WARN(condition, format...) (!!(condition)) -#define sev_printk(fmt, ...) -#define sev_printk_rtl(fmt, ...) #undef vc_forward_exception #define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n") #endif /* * SVSM related information: - * When running under an SVSM, the VMPL that Linux is executing at must be - * non-zero. The VMPL is therefore used to indicate the presence of an SVSM. - * * During boot, the page tables are set up as identity mapped and later * changed to use kernel virtual addresses. Maintain separate virtual and * physical addresses for the CAA to allow SVSM functions to be used during * early boot, both with identity mapped virtual addresses and proper kernel * virtual addresses. */ -u8 snp_vmpl __ro_after_init; -EXPORT_SYMBOL_GPL(snp_vmpl); -static struct svsm_ca *boot_svsm_caa __ro_after_init; -static u64 boot_svsm_caa_pa __ro_after_init; - -static struct svsm_ca *svsm_get_caa(void); -static u64 svsm_get_caa_pa(void); -static int svsm_perform_call_protocol(struct svsm_call *call); - -/* I/O parameters for CPUID-related helpers */ -struct cpuid_leaf { - u32 fn; - u32 subfn; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; -}; - -/* - * Individual entries of the SNP CPUID table, as defined by the SNP - * Firmware ABI, Revision 0.9, Section 7.1, Table 14. - */ -struct snp_cpuid_fn { - u32 eax_in; - u32 ecx_in; - u64 xcr0_in; - u64 xss_in; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u64 __reserved; -} __packed; - -/* - * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, - * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit - * of 64 entries per CPUID table. - */ -#define SNP_CPUID_COUNT_MAX 64 - -struct snp_cpuid_table { - u32 count; - u32 __reserved1; - u64 __reserved2; - struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; -} __packed; +struct svsm_ca *boot_svsm_caa __ro_after_init; +u64 boot_svsm_caa_pa __ro_after_init; /* * Since feature negotiation related variables are set early in the boot @@ -107,7 +54,7 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -static bool __init sev_es_check_cpu_features(void) +bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { error("RDRAND instruction not supported - no trusted source of randomness available\n"); @@ -117,7 +64,7 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void __head __noreturn +void __head __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -136,7 +83,7 @@ sev_es_terminate(unsigned int set, unsigned int reason) /* * The hypervisor features are available from GHCB version 2 onward. */ -static u64 get_hv_features(void) +u64 get_hv_features(void) { u64 val; @@ -153,7 +100,7 @@ static u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -static void snp_register_ghcb_early(unsigned long paddr) +void snp_register_ghcb_early(unsigned long paddr) { unsigned long pfn = paddr >> PAGE_SHIFT; u64 val; @@ -169,7 +116,7 @@ static void snp_register_ghcb_early(unsigned long paddr) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); } -static bool sev_es_negotiate_protocol(void) +bool sev_es_negotiate_protocol(void) { u64 val; @@ -190,39 +137,6 @@ static bool sev_es_negotiate_protocol(void) return true; } -static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) -{ - ghcb->save.sw_exit_code = 0; - __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); -} - -static bool vc_decoding_needed(unsigned long exit_code) -{ - /* Exceptions don't require to decode the instruction */ - return !(exit_code >= SVM_EXIT_EXCP_BASE && - exit_code <= SVM_EXIT_LAST_EXCP); -} - -static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, - struct pt_regs *regs, - unsigned long exit_code) -{ - enum es_result ret = ES_OK; - - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->regs = regs; - - if (vc_decoding_needed(exit_code)) - ret = vc_decode_insn(ctxt); - - return ret; -} - -static void vc_finish_insn(struct es_em_ctxt *ctxt) -{ - ctxt->regs->ip += ctxt->insn.length; -} - static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { u32 ret; @@ -344,7 +258,7 @@ static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) * Fill in protocol and format specifiers. This can be called very early * in the boot, so use rip-relative references as needed. */ - ghcb->protocol_version = RIP_REL_REF(ghcb_version); + ghcb->protocol_version = ghcb_version; ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); @@ -371,10 +285,10 @@ static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) return svsm_process_result_codes(call); } -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) { /* Fill in protocol and format specifiers */ ghcb->protocol_version = ghcb_version; @@ -473,9 +387,9 @@ static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid * while running with the initial identity mapping as well as the * switch-over to kernel virtual addresses later. */ -static const struct snp_cpuid_table *snp_cpuid_get_table(void) +const struct snp_cpuid_table *snp_cpuid_get_table(void) { - return &RIP_REL_REF(cpuid_table_copy); + return rip_rel_ptr(&cpuid_table_copy); } /* @@ -672,7 +586,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -static int __head +int __head snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -701,9 +615,9 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; /* Skip post-processing for out-of-range zero leafs. */ - if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) || - (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) || - (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max)))) + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) return 0; } @@ -782,391 +696,6 @@ fail: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } -static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, - unsigned long address, - bool write) -{ - if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_USER; - ctxt->fi.cr2 = address; - if (write) - ctxt->fi.error_code |= X86_PF_WRITE; - - return ES_EXCEPTION; - } - - return ES_OK; -} - -static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, - void *src, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, b = backwards ? -1 : 1; - unsigned long address = (unsigned long)src; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, false); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *s = src + (i * data_size * b); - char *d = buf + (i * data_size); - - ret = vc_read_mem(ctxt, s, d, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, - void *dst, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, s = backwards ? -1 : 1; - unsigned long address = (unsigned long)dst; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, true); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *d = dst + (i * data_size * s); - char *b = buf + (i * data_size); - - ret = vc_write_mem(ctxt, d, b, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -#define IOIO_TYPE_STR BIT(2) -#define IOIO_TYPE_IN 1 -#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) -#define IOIO_TYPE_OUT 0 -#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) - -#define IOIO_REP BIT(3) - -#define IOIO_ADDR_64 BIT(9) -#define IOIO_ADDR_32 BIT(8) -#define IOIO_ADDR_16 BIT(7) - -#define IOIO_DATA_32 BIT(6) -#define IOIO_DATA_16 BIT(5) -#define IOIO_DATA_8 BIT(4) - -#define IOIO_SEG_ES (0 << 10) -#define IOIO_SEG_DS (3 << 10) - -static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) -{ - struct insn *insn = &ctxt->insn; - size_t size; - u64 port; - - *exitinfo = 0; - - switch (insn->opcode.bytes[0]) { - /* INS opcodes */ - case 0x6c: - case 0x6d: - *exitinfo |= IOIO_TYPE_INS; - *exitinfo |= IOIO_SEG_ES; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUTS opcodes */ - case 0x6e: - case 0x6f: - *exitinfo |= IOIO_TYPE_OUTS; - *exitinfo |= IOIO_SEG_DS; - port = ctxt->regs->dx & 0xffff; - break; - - /* IN immediate opcodes */ - case 0xe4: - case 0xe5: - *exitinfo |= IOIO_TYPE_IN; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* OUT immediate opcodes */ - case 0xe6: - case 0xe7: - *exitinfo |= IOIO_TYPE_OUT; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* IN register opcodes */ - case 0xec: - case 0xed: - *exitinfo |= IOIO_TYPE_IN; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUT register opcodes */ - case 0xee: - case 0xef: - *exitinfo |= IOIO_TYPE_OUT; - port = ctxt->regs->dx & 0xffff; - break; - - default: - return ES_DECODE_FAILED; - } - - *exitinfo |= port << 16; - - switch (insn->opcode.bytes[0]) { - case 0x6c: - case 0x6e: - case 0xe4: - case 0xe6: - case 0xec: - case 0xee: - /* Single byte opcodes */ - *exitinfo |= IOIO_DATA_8; - size = 1; - break; - default: - /* Length determined by instruction parsing */ - *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 - : IOIO_DATA_32; - size = (insn->opnd_bytes == 2) ? 2 : 4; - } - - switch (insn->addr_bytes) { - case 2: - *exitinfo |= IOIO_ADDR_16; - break; - case 4: - *exitinfo |= IOIO_ADDR_32; - break; - case 8: - *exitinfo |= IOIO_ADDR_64; - break; - } - - if (insn_has_rep_prefix(insn)) - *exitinfo |= IOIO_REP; - - return vc_ioio_check(ctxt, (u16)port, size); -} - -static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u64 exit_info_1, exit_info_2; - enum es_result ret; - - ret = vc_ioio_exitinfo(ctxt, &exit_info_1); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_STR) { - - /* (REP) INS/OUTS */ - - bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); - unsigned int io_bytes, exit_bytes; - unsigned int ghcb_count, op_count; - unsigned long es_base; - u64 sw_scratch; - - /* - * For the string variants with rep prefix the amount of in/out - * operations per #VC exception is limited so that the kernel - * has a chance to take interrupts and re-schedule while the - * instruction is emulated. - */ - io_bytes = (exit_info_1 >> 4) & 0x7; - ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; - - op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; - exit_info_2 = min(op_count, ghcb_count); - exit_bytes = exit_info_2 * io_bytes; - - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - /* Read bytes of OUTS into the shared buffer */ - if (!(exit_info_1 & IOIO_TYPE_IN)) { - ret = vc_insn_string_read(ctxt, - (void *)(es_base + regs->si), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - } - - /* - * Issue an VMGEXIT to the HV to consume the bytes from the - * shared buffer or to have it write them into the shared buffer - * depending on the instruction: OUTS or INS. - */ - sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); - ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, - exit_info_1, exit_info_2); - if (ret != ES_OK) - return ret; - - /* Read bytes from shared buffer into the guest's destination. */ - if (exit_info_1 & IOIO_TYPE_IN) { - ret = vc_insn_string_write(ctxt, - (void *)(es_base + regs->di), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - - if (df) - regs->di -= exit_bytes; - else - regs->di += exit_bytes; - } else { - if (df) - regs->si -= exit_bytes; - else - regs->si += exit_bytes; - } - - if (exit_info_1 & IOIO_REP) - regs->cx -= exit_info_2; - - ret = regs->cx ? ES_RETRY : ES_OK; - - } else { - - /* IN/OUT into/from rAX */ - - int bits = (exit_info_1 & 0x70) >> 1; - u64 rax = 0; - - if (!(exit_info_1 & IOIO_TYPE_IN)) - rax = lower_bits(regs->ax, bits); - - ghcb_set_rax(ghcb, rax); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_IN) { - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - regs->ax = lower_bits(ghcb->save.rax, bits); - } - } - - return ret; -} - -static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - struct cpuid_leaf leaf; - int ret; - - leaf.fn = regs->ax; - leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); - if (!ret) { - regs->ax = leaf.eax; - regs->bx = leaf.ebx; - regs->cx = leaf.ecx; - regs->dx = leaf.edx; - } - - return ret; -} - -static enum es_result vc_handle_cpuid(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u32 cr4 = native_read_cr4(); - enum es_result ret; - int snp_cpuid_ret; - - snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); - if (!snp_cpuid_ret) - return ES_OK; - if (snp_cpuid_ret != -EOPNOTSUPP) - return ES_VMM_ERROR; - - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rcx(ghcb, regs->cx); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #GP - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - regs->ax = ghcb->save.rax; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); - enum es_result ret; - - /* - * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure - * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP - * instructions are being intercepted. If this should occur and Secure - * TSC is enabled, guest execution should be terminated as the guest - * cannot rely on the TSC value provided by the hypervisor. - */ - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) - return ES_VMM_ERROR; - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && - (!rdtscp || ghcb_rcx_is_valid(ghcb)))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - if (rdtscp) - ctxt->regs->cx = ghcb->save.rcx; - - return ES_OK; -} - struct cc_setup_data { struct setup_data header; u32 cc_blob_address; @@ -1224,36 +753,14 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; if (fn->eax_in == 0x0) - RIP_REL_REF(cpuid_std_range_max) = fn->eax; + cpuid_std_range_max = fn->eax; else if (fn->eax_in == 0x40000000) - RIP_REL_REF(cpuid_hyp_range_max) = fn->eax; + cpuid_hyp_range_max = fn->eax; else if (fn->eax_in == 0x80000000) - RIP_REL_REF(cpuid_ext_range_max) = fn->eax; + cpuid_ext_range_max = fn->eax; } } -static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, - int ret, u64 svsm_ret) -{ - WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n", - pfn, action, page_size, ret, svsm_ret); - - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); -} - -static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret) -{ - unsigned int page_size; - bool action; - u64 pfn; - - pfn = pc->entry[pc->cur_index].pfn; - action = pc->entry[pc->cur_index].action; - page_size = pc->entry[pc->cur_index].page_size; - - __pval_terminate(pfn, action, page_size, ret, svsm_ret); -} - static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) { struct svsm_pvalidate_call *pc; @@ -1296,11 +803,7 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, { int ret; - /* - * This can be called very early during boot, so use rIP-relative - * references as needed. - */ - if (RIP_REL_REF(snp_vmpl)) { + if (snp_vmpl) { svsm_pval_4k_page(paddr, validate); } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); @@ -1309,351 +812,6 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, } } -static void pval_pages(struct snp_psc_desc *desc) -{ - struct psc_entry *e; - unsigned long vaddr; - unsigned int size; - unsigned int i; - bool validate; - u64 pfn; - int rc; - - for (i = 0; i <= desc->hdr.end_entry; i++) { - e = &desc->entries[i]; - - pfn = e->gfn; - vaddr = (unsigned long)pfn_to_kaddr(pfn); - size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; - validate = e->operation == SNP_PAGE_STATE_PRIVATE; - - rc = pvalidate(vaddr, size, validate); - if (!rc) - continue; - - if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { - unsigned long vaddr_end = vaddr + PMD_SIZE; - - for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) { - rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); - if (rc) - __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0); - } - } else { - __pval_terminate(pfn, validate, size, rc, 0); - } - } -} - -static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action, - struct svsm_pvalidate_call *pc) -{ - struct svsm_pvalidate_entry *pe; - - /* Nothing in the CA yet */ - pc->num_entries = 0; - pc->cur_index = 0; - - pe = &pc->entry[0]; - - while (pfn < pfn_end) { - pe->page_size = RMP_PG_SIZE_4K; - pe->action = action; - pe->ignore_cf = 0; - pe->pfn = pfn; - - pe++; - pfn++; - - pc->num_entries++; - if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) - break; - } - - return pfn; -} - -static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry, - struct svsm_pvalidate_call *pc) -{ - struct svsm_pvalidate_entry *pe; - struct psc_entry *e; - - /* Nothing in the CA yet */ - pc->num_entries = 0; - pc->cur_index = 0; - - pe = &pc->entry[0]; - e = &desc->entries[desc_entry]; - - while (desc_entry <= desc->hdr.end_entry) { - pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; - pe->action = e->operation == SNP_PAGE_STATE_PRIVATE; - pe->ignore_cf = 0; - pe->pfn = e->gfn; - - pe++; - e++; - - desc_entry++; - pc->num_entries++; - if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) - break; - } - - return desc_entry; -} - -static void svsm_pval_pages(struct snp_psc_desc *desc) -{ - struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY]; - unsigned int i, pv_4k_count = 0; - struct svsm_pvalidate_call *pc; - struct svsm_call call = {}; - unsigned long flags; - bool action; - u64 pc_pa; - int ret; - - /* - * This can be called very early in the boot, use native functions in - * order to avoid paravirt issues. - */ - flags = native_local_irq_save(); - - /* - * The SVSM calling area (CA) can support processing 510 entries at a - * time. Loop through the Page State Change descriptor until the CA is - * full or the last entry in the descriptor is reached, at which time - * the SVSM is invoked. This repeats until all entries in the descriptor - * are processed. - */ - call.caa = svsm_get_caa(); - - pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; - pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); - - /* Protocol 0, Call ID 1 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); - call.rcx = pc_pa; - - for (i = 0; i <= desc->hdr.end_entry;) { - i = svsm_build_ca_from_psc_desc(desc, i, pc); - - do { - ret = svsm_perform_call_protocol(&call); - if (!ret) - continue; - - /* - * Check if the entry failed because of an RMP mismatch (a - * PVALIDATE at 2M was requested, but the page is mapped in - * the RMP as 4K). - */ - - if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH && - pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) { - /* Save this entry for post-processing at 4K */ - pv_4k[pv_4k_count++] = pc->entry[pc->cur_index]; - - /* Skip to the next one unless at the end of the list */ - pc->cur_index++; - if (pc->cur_index < pc->num_entries) - ret = -EAGAIN; - else - ret = 0; - } - } while (ret == -EAGAIN); - - if (ret) - svsm_pval_terminate(pc, ret, call.rax_out); - } - - /* Process any entries that failed to be validated at 2M and validate them at 4K */ - for (i = 0; i < pv_4k_count; i++) { - u64 pfn, pfn_end; - - action = pv_4k[i].action; - pfn = pv_4k[i].pfn; - pfn_end = pfn + 512; - - while (pfn < pfn_end) { - pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc); - - ret = svsm_perform_call_protocol(&call); - if (ret) - svsm_pval_terminate(pc, ret, call.rax_out); - } - } - - native_local_irq_restore(flags); -} - -static void pvalidate_pages(struct snp_psc_desc *desc) -{ - if (snp_vmpl) - svsm_pval_pages(desc); - else - pval_pages(desc); -} - -static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) -{ - int cur_entry, end_entry, ret = 0; - struct snp_psc_desc *data; - struct es_em_ctxt ctxt; - - vc_ghcb_invalidate(ghcb); - - /* Copy the input desc into GHCB shared buffer */ - data = (struct snp_psc_desc *)ghcb->shared_buffer; - memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - - /* - * As per the GHCB specification, the hypervisor can resume the guest - * before processing all the entries. Check whether all the entries - * are processed. If not, then keep retrying. Note, the hypervisor - * will update the data memory directly to indicate the status, so - * reference the data->hdr everywhere. - * - * The strategy here is to wait for the hypervisor to change the page - * state in the RMP table before guest accesses the memory pages. If the - * page state change was not successful, then later memory access will - * result in a crash. - */ - cur_entry = data->hdr.cur_entry; - end_entry = data->hdr.end_entry; - - while (data->hdr.cur_entry <= data->hdr.end_entry) { - ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - - /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); - - /* - * Page State Change VMGEXIT can pass error code through - * exit_info_2. - */ - if (WARN(ret || ghcb->save.sw_exit_info_2, - "SNP: PSC failed ret=%d exit_info_2=%llx\n", - ret, ghcb->save.sw_exit_info_2)) { - ret = 1; - goto out; - } - - /* Verify that reserved bit is not set */ - if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { - ret = 1; - goto out; - } - - /* - * Sanity check that entry processing is not going backwards. - * This will happen only if hypervisor is tricking us. - */ - if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, -"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", - end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { - ret = 1; - goto out; - } - } - -out: - return ret; -} - -static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; - u8 modrm = ctxt->insn.modrm.value; - - switch (exit_code) { - - case SVM_EXIT_IOIO: - case SVM_EXIT_NPF: - /* handled separately */ - return ES_OK; - - case SVM_EXIT_CPUID: - if (opcode == 0xa20f) - return ES_OK; - break; - - case SVM_EXIT_INVD: - if (opcode == 0x080f) - return ES_OK; - break; - - case SVM_EXIT_MONITOR: - /* MONITOR and MONITORX instructions generate the same error code */ - if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa)) - return ES_OK; - break; - - case SVM_EXIT_MWAIT: - /* MWAIT and MWAITX instructions generate the same error code */ - if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb)) - return ES_OK; - break; - - case SVM_EXIT_MSR: - /* RDMSR */ - if (opcode == 0x320f || - /* WRMSR */ - opcode == 0x300f) - return ES_OK; - break; - - case SVM_EXIT_RDPMC: - if (opcode == 0x330f) - return ES_OK; - break; - - case SVM_EXIT_RDTSC: - if (opcode == 0x310f) - return ES_OK; - break; - - case SVM_EXIT_RDTSCP: - if (opcode == 0x010f && modrm == 0xf9) - return ES_OK; - break; - - case SVM_EXIT_READ_DR7: - if (opcode == 0x210f && - X86_MODRM_REG(ctxt->insn.modrm.value) == 7) - return ES_OK; - break; - - case SVM_EXIT_VMMCALL: - if (opcode == 0x010f && modrm == 0xd9) - return ES_OK; - - break; - - case SVM_EXIT_WRITE_DR7: - if (opcode == 0x230f && - X86_MODRM_REG(ctxt->insn.modrm.value) == 7) - return ES_OK; - break; - - case SVM_EXIT_WBINVD: - if (opcode == 0x90f) - return ES_OK; - break; - - default: - break; - } - - sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", - opcode, exit_code, ctxt->regs->ip); - - return ES_UNSUPPORTED; -} - /* * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. @@ -1681,7 +839,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * routine is running identity mapped when called, both by the decompressor * code and the early kernel code. */ - if (!rmpadjust((unsigned long)&RIP_REL_REF(boot_ghcb_page), RMP_PG_SIZE_4K, 1)) + if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1)) return false; /* @@ -1698,7 +856,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) if (!secrets_page->svsm_guest_vmpl) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_VMPL0); - RIP_REL_REF(snp_vmpl) = secrets_page->svsm_guest_vmpl; + snp_vmpl = secrets_page->svsm_guest_vmpl; caa = secrets_page->svsm_caa; @@ -1713,8 +871,8 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * The CA is identity mapped when this routine is called, both by the * decompressor code and the early kernel code. */ - RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)caa; - RIP_REL_REF(boot_svsm_caa_pa) = caa; + boot_svsm_caa = (struct svsm_ca *)caa; + boot_svsm_caa_pa = caa; /* Advertise the SVSM presence via CPUID. */ cpuid_table = (struct snp_cpuid_table *)snp_cpuid_get_table(); diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c new file mode 100644 index 000000000000..0b7e3b950183 --- /dev/null +++ b/arch/x86/boot/startup/sev-startup.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/percpu-defs.h> +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/efi.h> +#include <linux/io.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> + +#include <asm/init.h> +#include <asm/cpu_entry_area.h> +#include <asm/stacktrace.h> +#include <asm/sev.h> +#include <asm/sev-internal.h> +#include <asm/insn-eval.h> +#include <asm/fpu/xcr.h> +#include <asm/processor.h> +#include <asm/realmode.h> +#include <asm/setup.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid/api.h> +#include <asm/cmdline.h> + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +struct ghcb *boot_ghcb __section(".data"); + +/* Bitmap of SEV features supported by the hypervisor */ +u64 sev_hv_features __ro_after_init; + +/* Secrets page physical address from the CC blob */ +u64 sev_secrets_pa __ro_after_init; + +/* For early boot SVSM communication */ +struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); + +DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); +DEFINE_PER_CPU(u64, svsm_caa_pa); + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-shared.c" + +noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} + +int svsm_perform_call_protocol(struct svsm_call *call) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + /* + * This can be called very early in the boot, use native functions in + * order to avoid paravirt issues. + */ + flags = native_local_irq_save(); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else if (boot_ghcb) + ghcb = boot_ghcb; + else + ghcb = NULL; + + do { + ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) + : svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + native_local_irq_restore(flags); + + return ret; +} + +void __head +early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + vaddr = vaddr & PAGE_MASK; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* Page validation must be rescinded before changing to shared */ + if (op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false); + + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) + goto e_term; + + if (GHCB_MSR_PSC_RESP_VAL(val)) + goto e_term; + + /* Page validation must be performed after changing to private */ + if (op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true); + + vaddr += PAGE_SIZE; + paddr += PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +static __head void svsm_setup(struct cc_blob_sev_info *cc_info) +{ + struct svsm_call call = {}; + int ret; + u64 pa; + + /* + * Record the SVSM Calling Area address (CAA) if the guest is not + * running at VMPL0. The CA will be used to communicate with the + * SVSM to perform the SVSM services. + */ + if (!svsm_setup_ca(cc_info)) + return; + + /* + * It is very early in the boot and the kernel is running identity + * mapped but without having adjusted the pagetables to where the + * kernel was loaded (physbase), so the get the CA address using + * RIP-relative addressing. + */ + pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); + + /* + * Switch over to the boot SVSM CA while the current CA is still + * addressable. There is no GHCB at this point so use the MSR protocol. + * + * SVSM_CORE_REMAP_CA call: + * RAX = 0 (Protocol=0, CallID=0) + * RCX = New CA GPA + */ + call.caa = svsm_get_caa(); + call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); + call.rcx = pa; + ret = svsm_perform_call_protocol(&call); + if (ret) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); + + boot_svsm_caa = (struct svsm_ca *)pa; + boot_svsm_caa_pa = pa; +} + +bool __head snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE) + sev_secrets_pa = cc_info->secrets_phys; + else + return false; + + setup_cpuid_table(cc_info); + + svsm_setup(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __head __noreturn snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/boot/startup/sme.c index 5eecdd92da10..70ea1748c0a7 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/boot/startup/sme.c @@ -45,8 +45,6 @@ #include <asm/coco.h> #include <asm/sev.h> -#include "mm_internal.h" - #define PGD_FLAGS _KERNPG_TABLE_NOENC #define P4D_FLAGS _KERNPG_TABLE_NOENC #define PUD_FLAGS _KERNPG_TABLE_NOENC @@ -299,8 +297,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) * instrumentation or checking boot_cpu_data in the cc_platform_has() * function. */ - if (!sme_get_me_mask() || - RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED) + if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) return; /* @@ -318,8 +315,8 @@ void __head sme_encrypt_kernel(struct boot_params *bp) * memory from being cached. */ - kernel_start = (unsigned long)RIP_REL_REF(_text); - kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE); + kernel_start = (unsigned long)rip_rel_ptr(_text); + kernel_end = ALIGN((unsigned long)rip_rel_ptr(_end), PMD_SIZE); kernel_len = kernel_end - kernel_start; initrd_start = 0; @@ -345,7 +342,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) * pagetable structures for the encryption of the kernel * pagetable structures for workarea (in case not currently mapped) */ - execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea); + execute_start = workarea_start = (unsigned long)rip_rel_ptr(sme_workarea); execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; execute_len = execute_end - execute_start; @@ -526,7 +523,7 @@ void __head sme_enable(struct boot_params *bp) me_mask = 1UL << (ebx & 0x3f); /* Check the SEV MSR whether SEV or SME is enabled */ - RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV); + sev_status = msr = native_rdmsrq(MSR_AMD64_SEV); feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; /* @@ -557,13 +554,22 @@ void __head sme_enable(struct boot_params *bp) return; /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_AMD64_SYSCFG); + msr = native_rdmsrq(MSR_AMD64_SYSCFG); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; } - RIP_REL_REF(sme_me_mask) = me_mask; - RIP_REL_REF(physical_mask) &= ~me_mask; - RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD; + sme_me_mask = me_mask; + physical_mask &= ~me_mask; + cc_vendor = CC_VENDOR_AMD; cc_set_mask(me_mask); } + +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +/* Local version for startup code, which never operates on user page tables */ +__weak +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 84f7a883ce1e..f35369bb14c5 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -32,7 +32,7 @@ int memcmp(const void *s1, const void *s2, size_t len) { bool diff; - asm("repe; cmpsb" CC_SET(nz) + asm("repe cmpsb" CC_SET(nz) : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index f2e96905b3fe..0641c8c46aee 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -292,7 +292,7 @@ static void restore_screen(void) "shrw %%cx ; " "jnc 1f ; " "stosw \n\t" - "1: rep;stosl ; " + "1: rep stosl ; " "popw %%es" : "+D" (dst), "+c" (npad) : "bdS" (video_segment), diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 9a0ddda3aa69..d4610af68114 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -18,7 +18,9 @@ #include <asm/processor.h> enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; +SYM_PIC_ALIAS(cc_vendor); u64 cc_mask __ro_after_init; +SYM_PIC_ALIAS(cc_mask); static struct cc_attr_flags { __u64 host_sev_snp : 1, diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index dcb06dc8b5ae..db3255b979bd 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -1,22 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o - -# jump tables are emitted using absolute references in non-PIC code -# so they cannot be used in the early SEV startup code -CFLAGS_core.o += -fno-jump-tables - -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = -pg -endif - -KASAN_SANITIZE_core.o := n -KMSAN_SANITIZE_core.o := n -KCOV_INSTRUMENT_core.o := n - -# With some compiler versions the generated code results in boot hangs, caused -# by several compilation units. To be safe, disable all instrumentation. -KCSAN_SANITIZE := n +obj-y += core.o sev-nmi.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining -UBSAN_SANITIZE := n +UBSAN_SANITIZE_sev-nmi.o := n + +# GCC may fail to respect __no_sanitize_address when inlining +KASAN_SANITIZE_sev-nmi.o := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b0c1a7a57497..b2569257acd3 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -31,6 +31,7 @@ #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> #include <asm/sev.h> +#include <asm/sev-internal.h> #include <asm/insn-eval.h> #include <asm/fpu/xcr.h> #include <asm/processor.h> @@ -41,10 +42,9 @@ #include <asm/smp.h> #include <asm/cpu.h> #include <asm/apic.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/cmdline.h> - -#define DR7_RESET_VALUE 0x400 +#include <asm/msr.h> /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ #define AP_INIT_CS_LIMIT 0xffff @@ -81,21 +81,6 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", }; -/* For early boot hypervisor communication in SEV-ES enabled guests */ -static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -static struct ghcb *boot_ghcb __section(".data"); - -/* Bitmap of SEV features supported by the hypervisor */ -static u64 sev_hv_features __ro_after_init; - -/* Secrets page physical address from the CC blob */ -static u64 secrets_pa __ro_after_init; - /* * For Secure TSC guests, the BSP fetches TSC_INFO using SNP guest messaging and * initializes snp_tsc_scale and snp_tsc_offset. These values are replicated @@ -105,558 +90,196 @@ static u64 snp_tsc_scale __ro_after_init; static u64 snp_tsc_offset __ro_after_init; static u64 snp_tsc_freq_khz __ro_after_init; -/* #VC handler runtime per-CPU data */ -struct sev_es_runtime_data { - struct ghcb ghcb_page; - - /* - * Reserve one page per CPU as backup storage for the unencrypted GHCB. - * It is needed when an NMI happens while the #VC handler uses the real - * GHCB, and the NMI handler itself is causing another #VC exception. In - * that case the GHCB content of the first handler needs to be backed up - * and restored. - */ - struct ghcb backup_ghcb; - - /* - * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. - * There is no need for it to be atomic, because nothing is written to - * the GHCB between the read and the write of ghcb_active. So it is safe - * to use it when a nested #VC exception happens before the write. - * - * This is necessary for example in the #VC->NMI->#VC case when the NMI - * happens while the first #VC handler uses the GHCB. When the NMI code - * raises a second #VC handler it might overwrite the contents of the - * GHCB written by the first handler. To avoid this the content of the - * GHCB is saved and restored when the GHCB is detected to be in use - * already. - */ - bool ghcb_active; - bool backup_ghcb_active; - - /* - * Cached DR7 value - write it on DR7 writes and return it on reads. - * That value will never make it to the real hardware DR7 as debugging - * is currently unsupported in SEV-ES guests. - */ - unsigned long dr7; -}; - -struct ghcb_state { - struct ghcb *ghcb; -}; - -/* For early boot SVSM communication */ -static struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); - -static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); -static DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); -static DEFINE_PER_CPU(u64, svsm_caa_pa); - -static __always_inline bool on_vc_stack(struct pt_regs *regs) -{ - unsigned long sp = regs->sp; - - /* User-mode RSP is not trusted */ - if (user_mode(regs)) - return false; - - /* SYSCALL gap still has user-mode RSP */ - if (ip_within_syscall_gap(regs)) - return false; - - return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); -} +DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); /* - * This function handles the case when an NMI is raised in the #VC - * exception handler entry code, before the #VC handler has switched off - * its IST stack. In this case, the IST entry for #VC must be adjusted, - * so that any nested #VC exception will not overwrite the stack - * contents of the interrupted #VC handler. - * - * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a - * nested sev_es_ist_exit() call may adjust back the IST entry too - * early. - * - * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run - * on the NMI IST stack, as they are only called from NMI handling code - * right now. + * SVSM related information: + * When running under an SVSM, the VMPL that Linux is executing at must be + * non-zero. The VMPL is therefore used to indicate the presence of an SVSM. */ -void noinstr __sev_es_ist_enter(struct pt_regs *regs) -{ - unsigned long old_ist, new_ist; - - /* Read old IST entry */ - new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - /* - * If NMI happened while on the #VC IST stack, set the new IST - * value below regs->sp, so that the interrupted stack frame is - * not overwritten by subsequent #VC exceptions. - */ - if (on_vc_stack(regs)) - new_ist = regs->sp; - - /* - * Reserve additional 8 bytes and store old IST value so this - * adjustment can be unrolled in __sev_es_ist_exit(). - */ - new_ist -= sizeof(old_ist); - *(unsigned long *)new_ist = old_ist; - - /* Set new IST entry */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); -} +u8 snp_vmpl __ro_after_init; +EXPORT_SYMBOL_GPL(snp_vmpl); -void noinstr __sev_es_ist_exit(void) +static u64 __init get_snp_jump_table_addr(void) { - unsigned long ist; + struct snp_secrets_page *secrets; + void __iomem *mem; + u64 addr; - /* Read IST entry */ - ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } - if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) - return; + secrets = (__force struct snp_secrets_page *)mem; - /* Read back old IST entry and write it to the TSS */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); + addr = secrets->os_area.ap_jump_table_pa; + iounmap(mem); + + return addr; } -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. - */ -static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +static u64 __init get_jump_table_addr(void) { - struct sev_es_runtime_data *data; + struct ghcb_state state; + unsigned long flags; struct ghcb *ghcb; + u64 ret = 0; - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); - state->ghcb = &data->backup_ghcb; + local_irq_save(flags); - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } + ghcb = __sev_get_ghcb(&state); - return ghcb; -} + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); -static inline u64 sev_es_rd_ghcb_msr(void) -{ - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); -} + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); -static __always_inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; - low = (u32)(val); - high = (u32)(val >> 32); + __sev_put_ghcb(&state); - native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); -} + local_irq_restore(flags); -static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, - unsigned char *buffer) -{ - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + return ret; } -static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, + int ret, u64 svsm_ret) { - char buffer[MAX_INSN_SIZE]; - int insn_bytes; - - insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (insn_bytes == 0) { - /* Nothing could be copied */ - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } else if (insn_bytes == -EINVAL) { - /* Effective RIP could not be calculated */ - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - ctxt->fi.cr2 = 0; - return ES_EXCEPTION; - } + WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n", + pfn, action, page_size, ret, svsm_ret); - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) - return ES_DECODE_FAILED; - - if (ctxt->insn.immediate.got) - return ES_OK; - else - return ES_DECODE_FAILED; + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); } -static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret) { - char buffer[MAX_INSN_SIZE]; - int res, ret; + unsigned int page_size; + bool action; + u64 pfn; - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } + pfn = pc->entry[pc->cur_index].pfn; + action = pc->entry[pc->cur_index].action; + page_size = pc->entry[pc->cur_index].page_size; - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - else - return ES_OK; + __pval_terminate(pfn, action, page_size, ret, svsm_ret); } -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +static void pval_pages(struct snp_psc_desc *desc) { - if (user_mode(ctxt->regs)) - return __vc_decode_user_insn(ctxt); - else - return __vc_decode_kern_insn(ctxt); -} + struct psc_entry *e; + unsigned long vaddr; + unsigned int size; + unsigned int i; + bool validate; + u64 pfn; + int rc; -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - char *dst, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + for (i = 0; i <= desc->hdr.end_entry; i++) { + e = &desc->entries[i]; - /* - * This function uses __put_user() independent of whether kernel or user - * memory is accessed. This works fine because __put_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __put_user() is not - * allowed to sleep. The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_to_user() here because - * vc_write_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. - */ - switch (size) { - case 1: { - u8 d1; - u8 __user *target = (u8 __user *)dst; - - memcpy(&d1, buf, 1); - if (__put_user(d1, target)) - goto fault; - break; - } - case 2: { - u16 d2; - u16 __user *target = (u16 __user *)dst; + pfn = e->gfn; + vaddr = (unsigned long)pfn_to_kaddr(pfn); + size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; + validate = e->operation == SNP_PAGE_STATE_PRIVATE; - memcpy(&d2, buf, 2); - if (__put_user(d2, target)) - goto fault; - break; - } - case 4: { - u32 d4; - u32 __user *target = (u32 __user *)dst; + rc = pvalidate(vaddr, size, validate); + if (!rc) + continue; - memcpy(&d4, buf, 4); - if (__put_user(d4, target)) - goto fault; - break; - } - case 8: { - u64 d8; - u64 __user *target = (u64 __user *)dst; + if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { + unsigned long vaddr_end = vaddr + PMD_SIZE; - memcpy(&d8, buf, 8); - if (__put_user(d8, target)) - goto fault; - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (rc) + __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0); + } + } else { + __pval_terminate(pfn, validate, size, rc, 0); + } } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)dst; - - return ES_EXCEPTION; } -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - char *src, char *buf, size_t size) +static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action, + struct svsm_pvalidate_call *pc) { - unsigned long error_code = X86_PF_PROT; + struct svsm_pvalidate_entry *pe; - /* - * This function uses __get_user() independent of whether kernel or user - * memory is accessed. This works fine because __get_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __get_user() is not - * allowed to sleep. The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_from_user() here because - * vc_read_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. - */ - switch (size) { - case 1: { - u8 d1; - u8 __user *s = (u8 __user *)src; - - if (__get_user(d1, s)) - goto fault; - memcpy(buf, &d1, 1); - break; - } - case 2: { - u16 d2; - u16 __user *s = (u16 __user *)src; - - if (__get_user(d2, s)) - goto fault; - memcpy(buf, &d2, 2); - break; - } - case 4: { - u32 d4; - u32 __user *s = (u32 __user *)src; - - if (__get_user(d4, s)) - goto fault; - memcpy(buf, &d4, 4); - break; - } - case 8: { - u64 d8; - u64 __user *s = (u64 __user *)src; - if (__get_user(d8, s)) - goto fault; - memcpy(buf, &d8, 8); - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } + /* Nothing in the CA yet */ + pc->num_entries = 0; + pc->cur_index = 0; - return ES_OK; + pe = &pc->entry[0]; -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; + while (pfn < pfn_end) { + pe->page_size = RMP_PG_SIZE_4K; + pe->action = action; + pe->ignore_cf = 0; + pe->pfn = pfn; - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)src; - - return ES_EXCEPTION; -} - -static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) -{ - unsigned long va = (unsigned long)vaddr; - unsigned int level; - phys_addr_t pa; - pgd_t *pgd; - pte_t *pte; + pe++; + pfn++; - pgd = __va(read_cr3_pa()); - pgd = &pgd[pgd_index(va)]; - pte = lookup_address_in_pgd(pgd, va, &level); - if (!pte) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.cr2 = vaddr; - ctxt->fi.error_code = 0; - - if (user_mode(ctxt->regs)) - ctxt->fi.error_code |= X86_PF_USER; - - return ES_EXCEPTION; + pc->num_entries++; + if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) + break; } - if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) - /* Emulated MMIO to/from encrypted memory not supported */ - return ES_UNSUPPORTED; - - pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - pa |= va & ~page_level_mask(level); - - *paddr = pa; - - return ES_OK; + return pfn; } -static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry, + struct svsm_pvalidate_call *pc) { - BUG_ON(size > 4); - - if (user_mode(ctxt->regs)) { - struct thread_struct *t = ¤t->thread; - struct io_bitmap *iobm = t->io_bitmap; - size_t idx; - - if (!iobm) - goto fault; - - for (idx = port; idx < port + size; ++idx) { - if (test_bit(idx, iobm->bitmap)) - goto fault; - } - } + struct svsm_pvalidate_entry *pe; + struct psc_entry *e; - return ES_OK; + /* Nothing in the CA yet */ + pc->num_entries = 0; + pc->cur_index = 0; -fault: - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; + pe = &pc->entry[0]; + e = &desc->entries[desc_entry]; - return ES_EXCEPTION; -} + while (desc_entry <= desc->hdr.end_entry) { + pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; + pe->action = e->operation == SNP_PAGE_STATE_PRIVATE; + pe->ignore_cf = 0; + pe->pfn = e->gfn; -static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) -{ - long error_code = ctxt->fi.error_code; - int trapnr = ctxt->fi.vector; - - ctxt->regs->orig_ax = ctxt->fi.error_code; + pe++; + e++; - switch (trapnr) { - case X86_TRAP_GP: - exc_general_protection(ctxt->regs, error_code); - break; - case X86_TRAP_UD: - exc_invalid_op(ctxt->regs); - break; - case X86_TRAP_PF: - write_cr2(ctxt->fi.cr2); - exc_page_fault(ctxt->regs, error_code); - break; - case X86_TRAP_AC: - exc_alignment_check(ctxt->regs, error_code); - break; - default: - pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); - BUG(); + desc_entry++; + pc->num_entries++; + if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT) + break; } -} - -/* Include code shared with pre-decompression boot stage */ -#include "shared.c" - -static inline struct svsm_ca *svsm_get_caa(void) -{ - /* - * Use rIP-relative references when called early in the boot. If - * ->use_cas is set, then it is late in the boot and no need - * to worry about rIP-relative references. - */ - if (RIP_REL_REF(sev_cfg).use_cas) - return this_cpu_read(svsm_caa); - else - return RIP_REL_REF(boot_svsm_caa); -} - -static u64 svsm_get_caa_pa(void) -{ - /* - * Use rIP-relative references when called early in the boot. If - * ->use_cas is set, then it is late in the boot and no need - * to worry about rIP-relative references. - */ - if (RIP_REL_REF(sev_cfg).use_cas) - return this_cpu_read(svsm_caa_pa); - else - return RIP_REL_REF(boot_svsm_caa_pa); -} -static noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } + return desc_entry; } -static int svsm_perform_call_protocol(struct svsm_call *call) +static void svsm_pval_pages(struct snp_psc_desc *desc) { - struct ghcb_state state; + struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY]; + unsigned int i, pv_4k_count = 0; + struct svsm_pvalidate_call *pc; + struct svsm_call call = {}; unsigned long flags; - struct ghcb *ghcb; + bool action; + u64 pc_pa; int ret; /* @@ -666,180 +289,145 @@ static int svsm_perform_call_protocol(struct svsm_call *call) flags = native_local_irq_save(); /* - * Use rip-relative references when called early in the boot. If - * ghcbs_initialized is set, then it is late in the boot and no need - * to worry about rip-relative references in called functions. + * The SVSM calling area (CA) can support processing 510 entries at a + * time. Loop through the Page State Change descriptor until the CA is + * full or the last entry in the descriptor is reached, at which time + * the SVSM is invoked. This repeats until all entries in the descriptor + * are processed. */ - if (RIP_REL_REF(sev_cfg).ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else if (RIP_REL_REF(boot_ghcb)) - ghcb = RIP_REL_REF(boot_ghcb); - else - ghcb = NULL; + call.caa = svsm_get_caa(); - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); + pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; + pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); - if (RIP_REL_REF(sev_cfg).ghcbs_initialized) - __sev_put_ghcb(&state); + /* Protocol 0, Call ID 1 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); + call.rcx = pc_pa; - native_local_irq_restore(flags); + for (i = 0; i <= desc->hdr.end_entry;) { + i = svsm_build_ca_from_psc_desc(desc, i, pc); - return ret; -} + do { + ret = svsm_perform_call_protocol(&call); + if (!ret) + continue; -void noinstr __sev_es_nmi_complete(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; + /* + * Check if the entry failed because of an RMP mismatch (a + * PVALIDATE at 2M was requested, but the page is mapped in + * the RMP as 4K). + */ - ghcb = __sev_get_ghcb(&state); + if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH && + pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) { + /* Save this entry for post-processing at 4K */ + pv_4k[pv_4k_count++] = pc->entry[pc->cur_index]; + + /* Skip to the next one unless at the end of the list */ + pc->cur_index++; + if (pc->cur_index < pc->num_entries) + ret = -EAGAIN; + else + ret = 0; + } + } while (ret == -EAGAIN); - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); + if (ret) + svsm_pval_terminate(pc, ret, call.rax_out); + } - sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); - VMGEXIT(); + /* Process any entries that failed to be validated at 2M and validate them at 4K */ + for (i = 0; i < pv_4k_count; i++) { + u64 pfn, pfn_end; - __sev_put_ghcb(&state); -} + action = pv_4k[i].action; + pfn = pv_4k[i].pfn; + pfn_end = pfn + 512; -static u64 __init get_snp_jump_table_addr(void) -{ - struct snp_secrets_page *secrets; - void __iomem *mem; - u64 addr; + while (pfn < pfn_end) { + pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc); - mem = ioremap_encrypted(secrets_pa, PAGE_SIZE); - if (!mem) { - pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); - return 0; + ret = svsm_perform_call_protocol(&call); + if (ret) + svsm_pval_terminate(pc, ret, call.rax_out); + } } - secrets = (__force struct snp_secrets_page *)mem; - - addr = secrets->os_area.ap_jump_table_pa; - iounmap(mem); - - return addr; + native_local_irq_restore(flags); } -static u64 __init get_jump_table_addr(void) +static void pvalidate_pages(struct snp_psc_desc *desc) { - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u64 ret = 0; - - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return get_snp_jump_table_addr(); - - local_irq_save(flags); + if (snp_vmpl) + svsm_pval_pages(desc); + else + pval_pages(desc); +} - ghcb = __sev_get_ghcb(&state); +static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct es_em_ctxt ctxt; vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_2(ghcb, 0); - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - if (ghcb_sw_exit_info_1_is_valid(ghcb) && - ghcb_sw_exit_info_2_is_valid(ghcb)) - ret = ghcb->save.sw_exit_info_2; - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - return ret; -} - -static void __head -early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) -{ - unsigned long paddr_end; - u64 val; - - vaddr = vaddr & PAGE_MASK; + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; - paddr = paddr & PAGE_MASK; - paddr_end = paddr + (npages << PAGE_SHIFT); + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - while (paddr < paddr_end) { - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. + * Page State Change VMGEXIT can pass error code through + * exit_info_2. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) - goto e_term; - - if (GHCB_MSR_PSC_RESP_VAL(val)) - goto e_term; + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } - vaddr += PAGE_SIZE; - paddr += PAGE_SIZE; + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } } - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); -} - -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. - */ - if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* - * Ask the hypervisor to mark the memory pages as private in the RMP - * table. - */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. - */ - if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); +out: + return ret; } static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, @@ -959,6 +547,102 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end) set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); } +static int vmgexit_ap_control(u64 event, struct sev_es_save_area *vmsa, u32 apic_id) +{ + bool create = event != SVM_VMGEXIT_AP_DESTROY; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret = 0; + + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + + if (create) + ghcb_set_rax(ghcb, vmsa->sev_features); + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, + ((u64)apic_id << 32) | + ((u64)snp_vmpl << 16) | + event); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP %s error\n", (create ? "CREATE" : "DESTROY")); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa) +{ + int ret; + + if (snp_vmpl) { + struct svsm_call call = {}; + unsigned long flags; + + local_irq_save(flags); + + call.caa = this_cpu_read(svsm_caa); + call.rcx = __pa(va); + + if (make_vmsa) { + /* Protocol 0, Call ID 2 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU); + call.rdx = __pa(caa); + call.r8 = apic_id; + } else { + /* Protocol 0, Call ID 3 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU); + } + + ret = svsm_perform_call_protocol(&call); + + local_irq_restore(flags); + } else { + /* + * If the kernel runs at VMPL0, it can change the VMSA + * bit for a page using the RMPADJUST instruction. + * However, for the instruction to succeed it must + * target the permissions of a lesser privileged (higher + * numbered) VMPL level, so use VMPL1. + */ + u64 attrs = 1; + + if (make_vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); + } + + return ret; +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id) +{ + int err; + + err = snp_set_vmsa(vmsa, NULL, apic_id, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + static void set_pte_enc(pte_t *kpte, int level, void *va) { struct pte_enc_desc d = { @@ -1005,7 +689,8 @@ static void unshare_all_memory(void) data = per_cpu(runtime_data, cpu); ghcb = (unsigned long)&data->ghcb_page; - if (addr <= ghcb && ghcb <= addr + size) { + /* Handle the case of a huge page containing the GHCB page */ + if (addr <= ghcb && ghcb < addr + size) { skipped_addr = true; break; } @@ -1055,11 +740,70 @@ void snp_kexec_begin(void) pr_warn("Failed to stop shared<->private conversions\n"); } +/* + * Shutdown all APs except the one handling kexec/kdump and clearing + * the VMSA tag on AP's VMSA pages as they are not being used as + * VMSA page anymore. + */ +static void shutdown_all_aps(void) +{ + struct sev_es_save_area *vmsa; + int apic_id, this_cpu, cpu; + + this_cpu = get_cpu(); + + /* + * APs are already in HLT loop when enc_kexec_finish() callback + * is invoked. + */ + for_each_present_cpu(cpu) { + vmsa = per_cpu(sev_vmsa, cpu); + + /* + * The BSP or offlined APs do not have guest allocated VMSA + * and there is no need to clear the VMSA tag for this page. + */ + if (!vmsa) + continue; + + /* + * Cannot clear the VMSA tag for the currently running vCPU. + */ + if (this_cpu == cpu) { + unsigned long pa; + struct page *p; + + pa = __pa(vmsa); + /* + * Mark the VMSA page of the running vCPU as offline + * so that is excluded and not touched by makedumpfile + * while generating vmcore during kdump. + */ + p = pfn_to_online_page(pa >> PAGE_SHIFT); + if (p) + __SetPageOffline(p); + continue; + } + + apic_id = cpuid_to_apicid[cpu]; + + /* + * Issue AP destroy to ensure AP gets kicked out of guest mode + * to allow using RMPADJUST to remove the VMSA tag on it's + * VMSA page. + */ + vmgexit_ap_control(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id); + snp_cleanup_vmsa(vmsa, apic_id); + } + + put_cpu(); +} + void snp_kexec_finish(void) { struct sev_es_runtime_data *data; + unsigned long size, addr; unsigned int level, cpu; - unsigned long size; struct ghcb *ghcb; pte_t *pte; @@ -1069,6 +813,8 @@ void snp_kexec_finish(void) if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; + shutdown_all_aps(); + unshare_all_memory(); /* @@ -1085,54 +831,11 @@ void snp_kexec_finish(void) ghcb = &data->ghcb_page; pte = lookup_address((unsigned long)ghcb, &level); size = page_level_size(level); - set_pte_enc(pte, level, (void *)ghcb); - snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE)); - } -} - -static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa) -{ - int ret; - - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - - local_irq_save(flags); - - call.caa = this_cpu_read(svsm_caa); - call.rcx = __pa(va); - - if (make_vmsa) { - /* Protocol 0, Call ID 2 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU); - call.rdx = __pa(caa); - call.r8 = apic_id; - } else { - /* Protocol 0, Call ID 3 */ - call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU); - } - - ret = svsm_perform_call_protocol(&call); - - local_irq_restore(flags); - } else { - /* - * If the kernel runs at VMPL0, it can change the VMSA - * bit for a page using the RMPADJUST instruction. - * However, for the instruction to succeed it must - * target the permissions of a lesser privileged (higher - * numbered) VMPL level, so use VMPL1. - */ - u64 attrs = 1; - - if (make_vmsa) - attrs |= RMPADJUST_VMSA_PAGE_BIT; - - ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); + /* Handle the case of a huge page containing the GHCB page */ + addr = (unsigned long)ghcb & page_level_mask(level); + set_pte_enc(pte, level, (void *)addr); + snp_set_memory_private(addr, (size / PAGE_SIZE)); } - - return ret; } #define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) @@ -1166,24 +869,10 @@ static void *snp_alloc_vmsa_page(int cpu) return page_address(p + 1); } -static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id) -{ - int err; - - err = snp_set_vmsa(vmsa, NULL, apic_id, false); - if (err) - pr_err("clear VMSA page failed (%u), leaking page\n", err); - else - free_page((unsigned long)vmsa); -} - static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) { struct sev_es_save_area *cur_vmsa, *vmsa; - struct ghcb_state state; struct svsm_ca *caa; - unsigned long flags; - struct ghcb *ghcb; u8 sipi_vector; int cpu, ret; u64 cr4; @@ -1297,33 +986,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) } /* Issue VMGEXIT AP Creation NAE event */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_rax(ghcb, vmsa->sev_features); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); - ghcb_set_sw_exit_info_1(ghcb, - ((u64)apic_id << 32) | - ((u64)snp_vmpl << 16) | - SVM_VMGEXIT_AP_CREATE); - ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (!ghcb_sw_exit_info_1_is_valid(ghcb) || - lower_32_bits(ghcb->save.sw_exit_info_1)) { - pr_err("SNP AP Creation error\n"); - ret = -EINVAL; - } - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - /* Perform cleanup if there was an error */ + ret = vmgexit_ap_control(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id); if (ret) { snp_cleanup_vmsa(vmsa, apic_id); vmsa = NULL; @@ -1417,90 +1080,6 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) return 0; } -/* Writes to the SVSM CAA MSR are ignored */ -static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) -{ - if (write) - return ES_OK; - - regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa)); - regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa)); - - return ES_OK; -} - -/* - * TSC related accesses should not exit to the hypervisor when a guest is - * executing with Secure TSC enabled, so special handling is required for - * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ. - */ -static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) -{ - u64 tsc; - - /* - * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled. - * Terminate the SNP guest when the interception is enabled. - */ - if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ) - return ES_VMM_ERROR; - - /* - * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC - * to return undefined values, so ignore all writes. - * - * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use - * the value returned by rdtsc_ordered(). - */ - if (write) { - WARN_ONCE(1, "TSC MSR writes are verboten!\n"); - return ES_OK; - } - - tsc = rdtsc_ordered(); - regs->ax = lower_32_bits(tsc); - regs->dx = upper_32_bits(tsc); - - return ES_OK; -} - -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - enum es_result ret; - bool write; - - /* Is it a WRMSR? */ - write = ctxt->insn.opcode.bytes[1] == 0x30; - - switch (regs->cx) { - case MSR_SVSM_CAA: - return __vc_handle_msr_caa(regs, write); - case MSR_IA32_TSC: - case MSR_AMD64_GUEST_TSC_FREQ: - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) - return __vc_handle_secure_tsc_msrs(regs, write); - break; - default: - break; - } - - ghcb_set_rcx(ghcb, regs->cx); - if (write) { - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rdx(ghcb, regs->dx); - } - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0); - - if ((ret == ES_OK) && !write) { - regs->ax = ghcb->save.rax; - regs->dx = ghcb->save.rdx; - } - - return ret; -} - static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; @@ -1713,748 +1292,6 @@ void __init sev_es_init_vc_handling(void) initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; } -static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) -{ - int trapnr = ctxt->fi.vector; - - if (trapnr == X86_TRAP_PF) - native_write_cr2(ctxt->fi.cr2); - - ctxt->regs->orig_ax = ctxt->fi.error_code; - do_early_exception(ctxt->regs, trapnr); -} - -static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} -static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned int bytes, bool read) -{ - u64 exit_code, exit_info_1, exit_info_2; - unsigned long ghcb_pa = __pa(ghcb); - enum es_result res; - phys_addr_t paddr; - void __user *ref; - - ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); - if (ref == (void __user *)-1L) - return ES_UNSUPPORTED; - - exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - - res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); - if (res != ES_OK) { - if (res == ES_EXCEPTION && !read) - ctxt->fi.error_code |= X86_PF_WRITE; - - return res; - } - - exit_info_1 = paddr; - /* Can never be greater than 8 */ - exit_info_2 = bytes; - - ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); -} - -/* - * The MOVS instruction has two memory operands, which raises the - * problem that it is not known whether the access to the source or the - * destination caused the #VC exception (and hence whether an MMIO read - * or write operation needs to be emulated). - * - * Instead of playing games with walking page-tables and trying to guess - * whether the source or destination is an MMIO range, split the move - * into two operations, a read and a write with only one memory operand. - * This will cause a nested #VC exception on the MMIO address which can - * then be handled. - * - * This implementation has the benefit that it also supports MOVS where - * source _and_ destination are MMIO regions. - * - * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a - * rare operation. If it turns out to be a performance problem the split - * operations can be moved to memcpy_fromio() and memcpy_toio(). - */ -static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, - unsigned int bytes) -{ - unsigned long ds_base, es_base; - unsigned char *src, *dst; - unsigned char buffer[8]; - enum es_result ret; - bool rep; - int off; - - ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - if (ds_base == -1L || es_base == -1L) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - src = ds_base + (unsigned char *)ctxt->regs->si; - dst = es_base + (unsigned char *)ctxt->regs->di; - - ret = vc_read_mem(ctxt, src, buffer, bytes); - if (ret != ES_OK) - return ret; - - ret = vc_write_mem(ctxt, dst, buffer, bytes); - if (ret != ES_OK) - return ret; - - if (ctxt->regs->flags & X86_EFLAGS_DF) - off = -bytes; - else - off = bytes; - - ctxt->regs->si += off; - ctxt->regs->di += off; - - rep = insn_has_rep_prefix(&ctxt->insn); - if (rep) - ctxt->regs->cx -= 1; - - if (!rep || ctxt->regs->cx == 0) - return ES_OK; - else - return ES_RETRY; -} - -static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - enum insn_mmio_type mmio; - unsigned int bytes = 0; - enum es_result ret; - u8 sign_byte; - long *reg_data; - - mmio = insn_decode_mmio(insn, &bytes); - if (mmio == INSN_MMIO_DECODE_FAILED) - return ES_DECODE_FAILED; - - if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { - reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); - if (!reg_data) - return ES_DECODE_FAILED; - } - - if (user_mode(ctxt->regs)) - return ES_UNSUPPORTED; - - switch (mmio) { - case INSN_MMIO_WRITE: - memcpy(ghcb->shared_buffer, reg_data, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_WRITE_IMM: - memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_READ: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero-extend for 32-bit operation */ - if (bytes == 4) - *reg_data = 0; - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_ZERO_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - memset(reg_data, 0, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_SIGN_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 0xff : 0x00; - } - - /* Sign extend based on operand size */ - memset(reg_data, sign_byte, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_MOVS: - ret = vc_handle_mmio_movs(ctxt, bytes); - break; - default: - ret = ES_UNSUPPORTED; - break; - } - - return ret; -} - -static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long val, *reg = vc_insn_get_rm(ctxt); - enum es_result ret; - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - val = *reg; - - /* Upper 32 bits must be written as zeroes */ - if (val >> 32) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - /* Clear out other reserved bits and set bit 10 */ - val = (val & 0xffff23ffL) | BIT(10); - - /* Early non-zero writes to DR7 are not supported */ - if (!data && (val & ~DR7_RESET_VALUE)) - return ES_UNSUPPORTED; - - /* Using a value of 0 for ExitInfo1 means RAX holds the value */ - ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); - if (ret != ES_OK) - return ret; - - if (data) - data->dr7 = val; - - return ES_OK; -} - -static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long *reg = vc_insn_get_rm(ctxt); - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - if (data) - *reg = data->dr7; - else - *reg = DR7_RESET_VALUE; - - return ES_OK; -} - -static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); -} - -static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rcx(ghcb, ctxt->regs->cx); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_monitor(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Treat it as a NOP and do not leak a physical address to the - * hypervisor. - */ - return ES_OK; -} - -static enum es_result vc_handle_mwait(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* Treat the same as MONITOR/MONITORX */ - return ES_OK; -} - -static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rax(ghcb, ctxt->regs->ax); - ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); - - if (x86_platform.hyper.sev_es_hcall_prepare) - x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); - if (ret != ES_OK) - return ret; - - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - - /* - * Call sev_es_hcall_finish() after regs->ax is already set. - * This allows the hypervisor handler to overwrite it again if - * necessary. - */ - if (x86_platform.hyper.sev_es_hcall_finish && - !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) - return ES_VMM_ERROR; - - return ES_OK; -} - -static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Calling ecx_alignment_check() directly does not work, because it - * enables IRQs and the GHCB is active. Forward the exception and call - * it later from vc_forward_exception(). - */ - ctxt->fi.vector = X86_TRAP_AC; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; -} - -static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, - struct ghcb *ghcb, - unsigned long exit_code) -{ - enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); - - if (result != ES_OK) - return result; - - switch (exit_code) { - case SVM_EXIT_READ_DR7: - result = vc_handle_dr7_read(ghcb, ctxt); - break; - case SVM_EXIT_WRITE_DR7: - result = vc_handle_dr7_write(ghcb, ctxt); - break; - case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: - result = vc_handle_trap_ac(ghcb, ctxt); - break; - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(ghcb, ctxt, exit_code); - break; - case SVM_EXIT_RDPMC: - result = vc_handle_rdpmc(ghcb, ctxt); - break; - case SVM_EXIT_INVD: - pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); - result = ES_UNSUPPORTED; - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(ghcb, ctxt); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(ghcb, ctxt); - break; - case SVM_EXIT_MSR: - result = vc_handle_msr(ghcb, ctxt); - break; - case SVM_EXIT_VMMCALL: - result = vc_handle_vmmcall(ghcb, ctxt); - break; - case SVM_EXIT_WBINVD: - result = vc_handle_wbinvd(ghcb, ctxt); - break; - case SVM_EXIT_MONITOR: - result = vc_handle_monitor(ghcb, ctxt); - break; - case SVM_EXIT_MWAIT: - result = vc_handle_mwait(ghcb, ctxt); - break; - case SVM_EXIT_NPF: - result = vc_handle_mmio(ghcb, ctxt); - break; - default: - /* - * Unexpected #VC exception - */ - result = ES_UNSUPPORTED; - } - - return result; -} - -static __always_inline bool is_vc2_stack(unsigned long sp) -{ - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); -} - -static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) -{ - unsigned long sp, prev_sp; - - sp = (unsigned long)regs; - prev_sp = regs->sp; - - /* - * If the code was already executing on the VC2 stack when the #VC - * happened, let it proceed to the normal handling routine. This way the - * code executing on the VC2 stack can cause #VC exceptions to get handled. - */ - return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); -} - -static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) -{ - struct ghcb_state state; - struct es_em_ctxt ctxt; - enum es_result result; - struct ghcb *ghcb; - bool ret = true; - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - result = vc_init_em_ctxt(&ctxt, regs, error_code); - - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, ghcb, error_code); - - __sev_put_ghcb(&state); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_VMM_ERROR: - pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_DECODE_FAILED: - pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - pr_emerg("Unknown result in %s():%d\n", __func__, result); - /* - * Emulating the instruction which caused the #VC exception - * failed - can't continue so print debug information - */ - BUG(); - } - - return ret; -} - -static __always_inline bool vc_is_db(unsigned long error_code) -{ - return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; -} - -/* - * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode - * and will panic when an error happens. - */ -DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) -{ - irqentry_state_t irq_state; - - /* - * With the current implementation it is always possible to switch to a - * safe stack because #VC exceptions only happen at known places, like - * intercepted instructions or accesses to MMIO areas/IO ports. They can - * also happen with code instrumentation when the hypervisor intercepts - * #DB, but the critical paths are forbidden to be instrumented, so #DB - * exceptions currently also only happen in safe places. - * - * But keep this here in case the noinstr annotations are violated due - * to bug elsewhere. - */ - if (unlikely(vc_from_invalid_context(regs))) { - instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); - } - - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (vc_is_db(error_code)) { - exc_debug(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* Show some debug info */ - show_regs(regs); - - /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - /* If that fails and we get here - just panic */ - panic("Returned from Terminate-Request to Hypervisor\n"); - } - - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); -} - -/* - * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode - * and will kill the current task with SIGBUS when an error happens. - */ -DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) -{ - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (vc_is_db(error_code)) { - noist_exc_debug(regs); - return; - } - - irqentry_enter_from_user_mode(regs); - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } - - instrumentation_end(); - irqentry_exit_to_user_mode(regs); -} - -bool __init handle_vc_boot_ghcb(struct pt_regs *regs) -{ - unsigned long exit_code = regs->orig_ax; - struct es_em_ctxt ctxt; - enum es_result result; - - vc_ghcb_invalidate(boot_ghcb); - - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_early_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - BUG(); - } - - return true; - -fail: - show_regs(regs); - - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - -/* - * Initial set up of SNP relies on information provided by the - * Confidential Computing blob, which can be passed to the kernel - * in the following ways, depending on how it is booted: - * - * - when booted via the boot/decompress kernel: - * - via boot_params - * - * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): - * - via a setup_data entry, as defined by the Linux Boot Protocol - * - * Scan for the blob in that order. - */ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - /* Boot kernel would have passed the CC blob via boot_params. */ - if (bp->cc_blob_address) { - cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; - goto found_cc_info; - } - - /* - * If kernel was booted directly, without the use of the - * boot/decompression kernel, the CC blob may have been passed via - * setup_data instead. - */ - cc_info = find_cc_blob_setup_data(bp); - if (!cc_info) - return NULL; - -found_cc_info: - if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); - - return cc_info; -} - -static __head void svsm_setup(struct cc_blob_sev_info *cc_info) -{ - struct svsm_call call = {}; - int ret; - u64 pa; - - /* - * Record the SVSM Calling Area address (CAA) if the guest is not - * running at VMPL0. The CA will be used to communicate with the - * SVSM to perform the SVSM services. - */ - if (!svsm_setup_ca(cc_info)) - return; - - /* - * It is very early in the boot and the kernel is running identity - * mapped but without having adjusted the pagetables to where the - * kernel was loaded (physbase), so the get the CA address using - * RIP-relative addressing. - */ - pa = (u64)&RIP_REL_REF(boot_svsm_ca_page); - - /* - * Switch over to the boot SVSM CA while the current CA is still - * addressable. There is no GHCB at this point so use the MSR protocol. - * - * SVSM_CORE_REMAP_CA call: - * RAX = 0 (Protocol=0, CallID=0) - * RCX = New CA GPA - */ - call.caa = svsm_get_caa(); - call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); - call.rcx = pa; - ret = svsm_perform_call_protocol(&call); - if (ret) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); - - RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa; - RIP_REL_REF(boot_svsm_caa_pa) = pa; -} - -bool __head snp_init(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - if (!bp) - return false; - - cc_info = find_cc_blob(bp); - if (!cc_info) - return false; - - if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE) - secrets_pa = cc_info->secrets_phys; - else - return false; - - setup_cpuid_table(cc_info); - - svsm_setup(cc_info); - - /* - * The CC blob will be used later to access the secrets page. Cache - * it here like the boot kernel does. - */ - bp->cc_blob_address = (u32)(unsigned long)cc_info; - - return true; -} - -void __head __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} - /* * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are * enabled, as the alternative (fallback) logic for DMI probing in the legacy @@ -2835,7 +1672,7 @@ struct snp_msg_desc *snp_msg_alloc(void) if (!mdesc) return ERR_PTR(-ENOMEM); - mem = ioremap_encrypted(secrets_pa, PAGE_SIZE); + mem = ioremap_encrypted(sev_secrets_pa, PAGE_SIZE); if (!mem) goto e_free_mdesc; @@ -3278,7 +2115,7 @@ void __init snp_secure_tsc_init(void) return; setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - rdmsrl(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz); + rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz); snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000); x86_platform.calibrate_cpu = securetsc_get_tsc_khz; diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/sev-nmi.c new file mode 100644 index 000000000000..d8dfaddfb367 --- /dev/null +++ b/arch/x86/coco/sev/sev-nmi.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/bug.h> +#include <linux/kernel.h> + +#include <asm/cpu_entry_area.h> +#include <asm/msr.h> +#include <asm/ptrace.h> +#include <asm/sev.h> +#include <asm/sev-internal.h> + +static __always_inline bool on_vc_stack(struct pt_regs *regs) +{ + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ + if (on_vc_stack(regs)) + new_ist = regs->sp; + + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + __sev_put_ghcb(&state); +} diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c new file mode 100644 index 000000000000..0989d98da130 --- /dev/null +++ b/arch/x86/coco/sev/vc-handle.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/sched/debug.h> /* For show_regs() */ +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> + +#include <asm/init.h> +#include <asm/stacktrace.h> +#include <asm/sev.h> +#include <asm/sev-internal.h> +#include <asm/insn-eval.h> +#include <asm/fpu/xcr.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid/api.h> + +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return ES_EXCEPTION; + } + + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = ¤t->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + +void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_PF: + write_cr2(ctxt->fi.cr2); + exc_page_fault(ctxt->regs, error_code); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int insn_bytes; + + insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (insn_bytes == 0) { + /* Nothing could be copied */ + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } else if (insn_bytes == -EINVAL) { + /* Effective RIP could not be calculated */ + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + ctxt->fi.cr2 = 0; + return ES_EXCEPTION; + } + + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) + return ES_DECODE_FAILED; + + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + + /* + * This function uses __put_user() independent of whether kernel or user + * memory is accessed. This works fine because __put_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __put_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_to_user() here because + * vc_write_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *target = (u8 __user *)dst; + + memcpy(&d1, buf, 1); + if (__put_user(d1, target)) + goto fault; + break; + } + case 2: { + u16 d2; + u16 __user *target = (u16 __user *)dst; + + memcpy(&d2, buf, 2); + if (__put_user(d2, target)) + goto fault; + break; + } + case 4: { + u32 d4; + u32 __user *target = (u32 __user *)dst; + + memcpy(&d4, buf, 4); + if (__put_user(d4, target)) + goto fault; + break; + } + case 8: { + u64 d8; + u64 __user *target = (u64 __user *)dst; + + memcpy(&d8, buf, 8); + if (__put_user(d8, target)) + goto fault; + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + + /* + * This function uses __get_user() independent of whether kernel or user + * memory is accessed. This works fine because __get_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __get_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_from_user() here because + * vc_read_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *s = (u8 __user *)src; + + if (__get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + } + case 2: { + u16 d2; + u16 __user *s = (u16 __user *)src; + + if (__get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + } + case 4: { + u32 d4; + u32 __user *s = (u32 __user *)src; + + if (__get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + } + case 8: { + u64 d8; + u64 __user *s = (u64 __user *)src; + if (__get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) + +#include "vc-shared.c" + +/* Writes to the SVSM CAA MSR are ignored */ +static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) +{ + if (write) + return ES_OK; + + regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa)); + regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa)); + + return ES_OK; +} + +/* + * TSC related accesses should not exit to the hypervisor when a guest is + * executing with Secure TSC enabled, so special handling is required for + * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ. + */ +static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) +{ + u64 tsc; + + /* + * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled. + * Terminate the SNP guest when the interception is enabled. + */ + if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ) + return ES_VMM_ERROR; + + /* + * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC + * to return undefined values, so ignore all writes. + * + * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use + * the value returned by rdtsc_ordered(). + */ + if (write) { + WARN_ONCE(1, "TSC MSR writes are verboten!\n"); + return ES_OK; + } + + tsc = rdtsc_ordered(); + regs->ax = lower_32_bits(tsc); + regs->dx = upper_32_bits(tsc); + + return ES_OK; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + bool write; + + /* Is it a WRMSR? */ + write = ctxt->insn.opcode.bytes[1] == 0x30; + + switch (regs->cx) { + case MSR_SVSM_CAA: + return __vc_handle_msr_caa(regs, write); + case MSR_IA32_TSC: + case MSR_AMD64_GUEST_TSC_FREQ: + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return __vc_handle_secure_tsc_msrs(regs, write); + break; + default: + break; + } + + ghcb_set_rcx(ghcb, regs->cx); + if (write) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, write, 0); + + if ((ret == ES_OK) && !write) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return res; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). + */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + enum insn_mmio_type mmio; + unsigned int bytes = 0; + enum es_result ret; + u8 sign_byte; + long *reg_data; + + mmio = insn_decode_mmio(insn, &bytes); + if (mmio == INSN_MMIO_DECODE_FAILED) + return ES_DECODE_FAILED; + + if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { + reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); + if (!reg_data) + return ES_DECODE_FAILED; + } + + if (user_mode(ctxt->regs)) + return ES_UNSUPPORTED; + + switch (mmio) { + case INSN_MMIO_WRITE: + memcpy(ghcb->shared_buffer, reg_data, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case INSN_MMIO_WRITE_IMM: + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case INSN_MMIO_READ: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_READ_ZERO_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + memset(reg_data, 0, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_READ_SIGN_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + + /* Sign extend based on operand size */ + memset(reg_data, sign_byte, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case INSN_MMIO_MOVS: + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + default: + ret = ES_UNSUPPORTED; + break; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) + return ES_VMM_ERROR; + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. + */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); + + if (result != ES_OK) + return result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline bool is_vc2_stack(unsigned long sp) +{ + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) +{ + unsigned long sp, prev_sp; + + sp = (unsigned long)regs; + prev_sp = regs->sp; + + /* + * If the code was already executing on the VC2 stack when the #VC + * happened, let it proceed to the normal handling routine. This way the + * code executing on the VC2 stack can cause #VC exceptions to get handled. + */ + return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); +} + +static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + bool ret = true; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + __sev_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + + return ret; +} + +static __always_inline bool vc_is_db(unsigned long error_code) +{ + return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; +} + +/* + * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode + * and will panic when an error happens. + */ +DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) +{ + irqentry_state_t irq_state; + + /* + * With the current implementation it is always possible to switch to a + * safe stack because #VC exceptions only happen at known places, like + * intercepted instructions or accesses to MMIO areas/IO ports. They can + * also happen with code instrumentation when the hypervisor intercepts + * #DB, but the critical paths are forbidden to be instrumented, so #DB + * exceptions currently also only happen in safe places. + * + * But keep this here in case the noinstr annotations are violated due + * to bug elsewhere. + */ + if (unlikely(vc_from_invalid_context(regs))) { + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); + } + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + exc_debug(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +/* + * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode + * and will kill the current task with SIGBUS when an error happens. + */ +DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) +{ + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + noist_exc_debug(regs); + return; + } + + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } + + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c new file mode 100644 index 000000000000..2c0ab0fdc060 --- /dev/null +++ b/arch/x86/coco/sev/vc-shared.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0 + +static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; + u8 modrm = ctxt->insn.modrm.value; + + switch (exit_code) { + + case SVM_EXIT_IOIO: + case SVM_EXIT_NPF: + /* handled separately */ + return ES_OK; + + case SVM_EXIT_CPUID: + if (opcode == 0xa20f) + return ES_OK; + break; + + case SVM_EXIT_INVD: + if (opcode == 0x080f) + return ES_OK; + break; + + case SVM_EXIT_MONITOR: + /* MONITOR and MONITORX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa)) + return ES_OK; + break; + + case SVM_EXIT_MWAIT: + /* MWAIT and MWAITX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb)) + return ES_OK; + break; + + case SVM_EXIT_MSR: + /* RDMSR */ + if (opcode == 0x320f || + /* WRMSR */ + opcode == 0x300f) + return ES_OK; + break; + + case SVM_EXIT_RDPMC: + if (opcode == 0x330f) + return ES_OK; + break; + + case SVM_EXIT_RDTSC: + if (opcode == 0x310f) + return ES_OK; + break; + + case SVM_EXIT_RDTSCP: + if (opcode == 0x010f && modrm == 0xf9) + return ES_OK; + break; + + case SVM_EXIT_READ_DR7: + if (opcode == 0x210f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_VMMCALL: + if (opcode == 0x010f && modrm == 0xd9) + return ES_OK; + + break; + + case SVM_EXIT_WRITE_DR7: + if (opcode == 0x230f && + X86_MODRM_REG(ctxt->insn.modrm.value) == 7) + return ES_OK; + break; + + case SVM_EXIT_WBINVD: + if (opcode == 0x90f) + return ES_OK; + break; + + default: + break; + } + + sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", + opcode, exit_code, ctxt->regs->ip); + + return ES_UNSUPPORTED; +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? -1 : 1; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + port = ctxt->regs->dx & 0xffff; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + port = (u8)insn->immediate.value & 0xffff; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + port = ctxt->regs->dx & 0xffff; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + port = ctxt->regs->dx & 0xffff; + break; + + default: + return ES_DECODE_FAILED; + } + + *exitinfo |= port << 16; + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + size = 1; + break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 2 : 4; + } + + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return vc_ioio_check(ctxt, (u16)port, size); +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. + */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(ghcb, ctxt, &leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + /* + * The hypervisor should not be intercepting RDTSC/RDTSCP when Secure + * TSC is enabled. A #VC exception will be generated if the RDTSC/RDTSCP + * instructions are being intercepted. If this should occur and Secure + * TSC is enabled, guest execution should be terminated as the guest + * cannot rely on the TSC value provided by the hypervisor. + */ + if (sev_status & MSR_AMD64_SNP_SECURE_TSC) + return ES_VMM_ERROR; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 91801138b10b..7cd2f395f301 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,7 +1,6 @@ CONFIG_WERROR=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_USELIB=y CONFIG_AUDIT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 3d948f10c94c..56cfdc79e2c6 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -4,7 +4,7 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" config CRYPTO_CURVE25519_X86 tristate - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 @@ -17,13 +17,11 @@ config CRYPTO_CURVE25519_X86 config CRYPTO_AES_NI_INTEL tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" - depends on X86 select CRYPTO_AEAD select CRYPTO_LIB_AES select CRYPTO_LIB_GF128MUL select CRYPTO_ALGAPI select CRYPTO_SKCIPHER - select CRYPTO_SIMD help Block cipher: AES cipher algorithms AEAD cipher: AES with GCM @@ -38,7 +36,7 @@ config CRYPTO_AES_NI_INTEL config CRYPTO_BLOWFISH_X86_64 tristate "Ciphers: Blowfish, modes: ECB, CBC" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_BLOWFISH_COMMON imply CRYPTO_CTR @@ -50,7 +48,7 @@ config CRYPTO_BLOWFISH_X86_64 config CRYPTO_CAMELLIA_X86_64 tristate "Ciphers: Camellia with modes: ECB, CBC" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER imply CRYPTO_CTR help @@ -61,10 +59,9 @@ config CRYPTO_CAMELLIA_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_CAMELLIA_X86_64 - select CRYPTO_SIMD imply CRYPTO_XTS help Length-preserving ciphers: Camellia with ECB and CBC modes @@ -75,7 +72,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 help Length-preserving ciphers: Camellia with ECB and CBC modes @@ -86,11 +83,10 @@ config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 config CRYPTO_CAST5_AVX_X86_64 tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_CAST5 select CRYPTO_CAST_COMMON - select CRYPTO_SIMD imply CRYPTO_CTR help Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm @@ -103,11 +99,10 @@ config CRYPTO_CAST5_AVX_X86_64 config CRYPTO_CAST6_AVX_X86_64 tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_CAST6 select CRYPTO_CAST_COMMON - select CRYPTO_SIMD imply CRYPTO_XTS imply CRYPTO_CTR help @@ -121,7 +116,7 @@ config CRYPTO_CAST6_AVX_X86_64 config CRYPTO_DES3_EDE_X86_64 tristate "Ciphers: Triple DES EDE with modes: ECB, CBC" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_LIB_DES imply CRYPTO_CTR @@ -135,10 +130,9 @@ config CRYPTO_DES3_EDE_X86_64 config CRYPTO_SERPENT_SSE2_X86_64 tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_SERPENT - select CRYPTO_SIMD imply CRYPTO_CTR help Length-preserving ciphers: Serpent cipher algorithm @@ -151,10 +145,9 @@ config CRYPTO_SERPENT_SSE2_X86_64 config CRYPTO_SERPENT_SSE2_586 tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)" - depends on X86 && !64BIT + depends on !64BIT select CRYPTO_SKCIPHER select CRYPTO_SERPENT - select CRYPTO_SIMD imply CRYPTO_CTR help Length-preserving ciphers: Serpent cipher algorithm @@ -167,10 +160,9 @@ config CRYPTO_SERPENT_SSE2_586 config CRYPTO_SERPENT_AVX_X86_64 tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_SERPENT - select CRYPTO_SIMD imply CRYPTO_XTS imply CRYPTO_CTR help @@ -184,7 +176,7 @@ config CRYPTO_SERPENT_AVX_X86_64 config CRYPTO_SERPENT_AVX2_X86_64 tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SERPENT_AVX_X86_64 help Length-preserving ciphers: Serpent cipher algorithm @@ -197,9 +189,8 @@ config CRYPTO_SERPENT_AVX2_X86_64 config CRYPTO_SM4_AESNI_AVX_X86_64 tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_SM4 help @@ -218,9 +209,8 @@ config CRYPTO_SM4_AESNI_AVX_X86_64 config CRYPTO_SM4_AESNI_AVX2_X86_64 tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_SM4 select CRYPTO_SM4_AESNI_AVX_X86_64 @@ -240,7 +230,7 @@ config CRYPTO_SM4_AESNI_AVX2_X86_64 config CRYPTO_TWOFISH_586 tristate "Ciphers: Twofish (32-bit)" - depends on (X86 || UML_X86) && !64BIT + depends on !64BIT select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -251,7 +241,7 @@ config CRYPTO_TWOFISH_586 config CRYPTO_TWOFISH_X86_64 tristate "Ciphers: Twofish" - depends on (X86 || UML_X86) && 64BIT + depends on 64BIT select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -262,7 +252,7 @@ config CRYPTO_TWOFISH_X86_64 config CRYPTO_TWOFISH_X86_64_3WAY tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 @@ -277,9 +267,8 @@ config CRYPTO_TWOFISH_X86_64_3WAY config CRYPTO_TWOFISH_AVX_X86_64 tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 select CRYPTO_TWOFISH_X86_64_3WAY @@ -295,9 +284,8 @@ config CRYPTO_TWOFISH_AVX_X86_64 config CRYPTO_ARIA_AESNI_AVX_X86_64 tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_ARIA help @@ -313,9 +301,8 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64 config CRYPTO_ARIA_AESNI_AVX2_X86_64 tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_ARIA select CRYPTO_ARIA_AESNI_AVX_X86_64 @@ -332,9 +319,8 @@ config CRYPTO_ARIA_AESNI_AVX2_X86_64 config CRYPTO_ARIA_GFNI_AVX512_X86_64 tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)" - depends on X86 && 64BIT && AS_AVX512 && AS_GFNI + depends on 64BIT && AS_GFNI select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_ARIA select CRYPTO_ARIA_AESNI_AVX_X86_64 @@ -349,27 +335,10 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64 Processes 64 blocks in parallel. -config CRYPTO_CHACHA20_X86_64 - tristate - depends on X86 && 64BIT - select CRYPTO_SKCIPHER - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - default CRYPTO_LIB_CHACHA_INTERNAL - help - Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 - stream cipher algorithms - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX2 (Advanced Vector Extensions 2) - - AVX-512VL (Advanced Vector Extensions-512VL) - config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_AEAD - select CRYPTO_SIMD help AEGIS-128 AEAD algorithm @@ -379,7 +348,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 config CRYPTO_NHPOLY1305_SSE2 tristate "Hash functions: NHPoly1305 (SSE2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_NHPOLY1305 help NHPoly1305 hash function for Adiantum @@ -389,7 +358,7 @@ config CRYPTO_NHPOLY1305_SSE2 config CRYPTO_NHPOLY1305_AVX2 tristate "Hash functions: NHPoly1305 (AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_NHPOLY1305 help NHPoly1305 hash function for Adiantum @@ -397,21 +366,9 @@ config CRYPTO_NHPOLY1305_AVX2 Architecture: x86_64 using: - AVX2 (Advanced Vector Extensions 2) -config CRYPTO_BLAKE2S_X86 - bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" - depends on X86 && 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 (Advanced Vector Extensions-512) - config CRYPTO_POLYVAL_CLMUL_NI tristate "Hash functions: POLYVAL (CLMUL-NI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_POLYVAL help POLYVAL hash function for HCTR2 @@ -419,23 +376,9 @@ config CRYPTO_POLYVAL_CLMUL_NI Architecture: x86_64 using: - CLMUL-NI (carry-less multiplication new instructions) -config CRYPTO_POLY1305_X86_64 - tristate - depends on X86 && 64BIT - select CRYPTO_HASH - select CRYPTO_LIB_POLY1305_GENERIC - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - default CRYPTO_LIB_POLY1305_INTERNAL - help - Poly1305 authenticator algorithm (RFC7539) - - Architecture: x86_64 using: - - SSE2 (Streaming SIMD Extensions 2) - - AVX2 (Advanced Vector Extensions 2) - config CRYPTO_SHA1_SSSE3 tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SHA1 select CRYPTO_HASH help @@ -447,23 +390,9 @@ config CRYPTO_SHA1_SSSE3 - AVX2 (Advanced Vector Extensions 2) - SHA-NI (SHA Extensions New Instructions) -config CRYPTO_SHA256_SSSE3 - tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)" - depends on X86 && 64BIT - select CRYPTO_SHA256 - select CRYPTO_HASH - help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - - SHA-NI (SHA Extensions New Instructions) - config CRYPTO_SHA512_SSSE3 tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SHA512 select CRYPTO_HASH help @@ -476,9 +405,9 @@ config CRYPTO_SHA512_SSSE3 config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_HASH - select CRYPTO_SM3 + select CRYPTO_LIB_SM3 help SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3 @@ -489,7 +418,7 @@ config CRYPTO_SM3_AVX_X86_64 config CRYPTO_GHASH_CLMUL_NI_INTEL tristate "Hash functions: GHASH (CLMUL-NI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_CRYPTD help GCM GHASH hash function (NIST SP800-38D) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5d19f41bde58..aa289a9e0153 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -42,10 +42,6 @@ cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o -chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o - obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ @@ -56,29 +52,17 @@ aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o endif obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o -sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o - -obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o -sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o -sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o +sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o @@ -104,10 +88,5 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) - # Disable GCOV in odd or sensitive code GCOV_PROFILE_curve25519-x86_64.o := n diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 26786e15abac..f1b6d40154e3 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -8,7 +8,6 @@ */ #include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> @@ -233,21 +232,18 @@ static struct aead_alg crypto_aegis128_aesni_alg = { .chunksize = AEGIS128_BLOCK_SIZE, .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aegis_ctx) + __alignof__(struct aegis_ctx), .cra_priority = 400, - .cra_name = "__aegis128", - .cra_driver_name = "__aegis128-aesni", + .cra_name = "aegis128", + .cra_driver_name = "aegis128-aesni", .cra_module = THIS_MODULE, } }; -static struct simd_aead_alg *simd_alg; - static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM4_1) || @@ -255,13 +251,12 @@ static int __init crypto_aegis128_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1, - &simd_alg); + return crypto_register_aead(&crypto_aegis128_aesni_alg); } static void __exit crypto_aegis128_aesni_module_exit(void) { - simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg); + crypto_unregister_aead(&crypto_aegis128_aesni_alg); } module_init(crypto_aegis128_aesni_module_init); diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S index 1685d8b24b2c..bbbfd80f5a50 100644 --- a/arch/x86/crypto/aes-ctr-avx-x86_64.S +++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S @@ -48,8 +48,7 @@ // using the following sets of CPU features: // - AES-NI && AVX // - VAES && AVX2 -// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2 -// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2 +// - VAES && AVX512BW && AVX512VL && BMI2 // // See the function definitions at the bottom of the file for more information. @@ -76,7 +75,6 @@ .text // Move a vector between memory and a register. -// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst @@ -86,7 +84,6 @@ .endm // Move a vector between registers. -// The registers must be in the first 16 vector registers. .macro _vmovdqa src, dst .if VL < 64 vmovdqa \src, \dst @@ -96,7 +93,7 @@ .endm // Broadcast a 128-bit value from memory to all 128-bit lanes of a vector -// register. The register operand must be in the first 16 vector registers. +// register. .macro _vbroadcast128 src, dst .if VL == 16 vmovdqu \src, \dst @@ -108,7 +105,6 @@ .endm // XOR two vectors together. -// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst .if VL < 64 vpxor \src1, \src2, \dst @@ -199,8 +195,8 @@ // XOR each with the zero-th round key. Also update LE_CTR if !\final. .macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0 .if \is_xctr - .if USE_AVX10 - _vmovdqa LE_CTR, AESDATA\i0 + .if USE_AVX512 + vmovdqa64 LE_CTR, AESDATA\i0 vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0 .else vpxor XCTR_IV, LE_CTR, AESDATA\i0 @@ -208,7 +204,7 @@ .endif vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 - .if USE_AVX10 + .if USE_AVX512 vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1 .else vpxor XCTR_IV, AESDATA\i1, AESDATA\i1 @@ -481,18 +477,12 @@ .Lxor_tail_partial_vec_0\@: // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked // loads/stores are available; otherwise it's a bit harder... -.if USE_AVX10 - .if VL <= 32 - mov $-1, %eax - bzhi LEN, %eax, %eax - kmovd %eax, %k1 - .else +.if USE_AVX512 mov $-1, %rax bzhi LEN64, %rax, %rax kmovq %rax, %k1 - .endif vmovdqu8 (SRC), AESDATA1{%k1}{z} - _vpxor AESDATA1, AESDATA0, AESDATA0 + vpxord AESDATA1, AESDATA0, AESDATA0 vmovdqu8 AESDATA0, (DST){%k1} .else .if VL == 32 @@ -554,7 +544,7 @@ // eliminates carries. |ctr| is the per-message block counter starting at 1. .set VL, 16 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx) _aes_ctr_crypt 0 SYM_FUNC_END(aes_ctr64_crypt_aesni_avx) @@ -564,7 +554,7 @@ SYM_FUNC_END(aes_xctr_crypt_aesni_avx) #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) .set VL, 32 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2) _aes_ctr_crypt 0 SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2) @@ -572,21 +562,12 @@ SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2) _aes_ctr_crypt 1 SYM_FUNC_END(aes_xctr_crypt_vaes_avx2) -.set VL, 32 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256) - _aes_ctr_crypt 0 -SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256) -SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256) - _aes_ctr_crypt 1 -SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256) - .set VL, 64 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512) +.set USE_AVX512, 1 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512) _aes_ctr_crypt 0 -SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512) -SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512) +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512) _aes_ctr_crypt 1 -SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512) +SYM_FUNC_END(aes_xctr_crypt_vaes_avx512) #endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 93ba0ddbe009..db79cdf81588 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -52,32 +52,25 @@ * different code, it uses a macro to generate several implementations that * share similar source code but are targeted at different CPUs, listed below: * - * AES-NI + AVX + * AES-NI && AVX * - 128-bit vectors (1 AES block per vector) * - VEX-coded instructions * - xmm0-xmm15 * - This is for older CPUs that lack VAES but do have AVX. * - * VAES + VPCLMULQDQ + AVX2 + * VAES && VPCLMULQDQ && AVX2 * - 256-bit vectors (2 AES blocks per vector) * - VEX-coded instructions * - ymm0-ymm15 - * - This is for CPUs that have VAES but lack AVX512 or AVX10, - * e.g. Intel's Alder Lake and AMD's Zen 3. + * - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's + * Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm + * registers (e.g. Intel's Ice Lake). * - * VAES + VPCLMULQDQ + AVX10/256 + BMI2 - * - 256-bit vectors (2 AES blocks per vector) + * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 + * - 512-bit vectors (4 AES blocks per vector) * - EVEX-coded instructions - * - ymm0-ymm31 - * - This is for CPUs that have AVX512 but where using zmm registers causes - * downclocking, and for CPUs that have AVX10/256 but not AVX10/512. - * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256. - * To avoid confusion with 512-bit, we just write AVX10/256. - * - * VAES + VPCLMULQDQ + AVX10/512 + BMI2 - * - Same as the previous one, but upgrades to 512-bit vectors - * (4 AES blocks per vector) in zmm0-zmm31. - * - This is for CPUs that have good AVX512 or AVX10/512 support. + * - zmm0-zmm31 + * - This is for CPUs that have good AVX512 support. * * This file doesn't have an implementation for AES-NI alone (without AVX), as * the lack of VEX would make all the assembly code different. @@ -107,9 +100,20 @@ // exists when there's a carry out of the low 64 bits of the tweak. .quad 0x87, 1 + // These are the shift amounts that are needed when multiplying by [x^0, + // x^1, x^2, x^3] to compute the first vector of tweaks when VL=64. + // + // The right shifts by 64 are expected to zeroize the destination. + // 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the + // amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would. +.Lrshift_amounts: + .byte 64, 64, 63, 63, 62, 62, 61, 61 +.Llshift_amounts: + .byte 0, 0, 1, 1, 2, 2, 3, 3 + // This table contains constants for vpshufb and vpblendvb, used to // handle variable byte shifts and blending during ciphertext stealing - // on CPUs that don't support AVX10-style masking. + // on CPUs that don't support AVX512-style masking. .Lcts_permute_table: .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 @@ -138,7 +142,7 @@ .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 _define_Vi \i .endr -.if USE_AVX10 +.if USE_AVX512 .irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 _define_Vi \i .endr @@ -193,7 +197,7 @@ // keys to the *end* of this register range. I.e., AES-128 uses // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. // (All also use KEY0 for the XOR-only "round" at the beginning.) -.if USE_AVX10 +.if USE_AVX512 .set KEY1_XMM, %xmm16 .set KEY1, V16 .set KEY2_XMM, %xmm17 @@ -227,7 +231,6 @@ .endm // Move a vector between memory and a register. -// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst @@ -238,9 +241,9 @@ // Broadcast a 128-bit value into a vector. .macro _vbroadcast128 src, dst -.if VL == 16 && !USE_AVX10 +.if VL == 16 vmovdqu \src, \dst -.elseif VL == 32 && !USE_AVX10 +.elseif VL == 32 vbroadcasti128 \src, \dst .else vbroadcasti32x4 \src, \dst @@ -248,7 +251,6 @@ .endm // XOR two vectors together. -// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst .if VL < 64 vpxor \src1, \src2, \dst @@ -259,7 +261,7 @@ // XOR three vectors together. .macro _xor3 src1, src2, src3_and_dst -.if USE_AVX10 +.if USE_AVX512 // vpternlogd with immediate 0x96 is a three-argument XOR. vpternlogd $0x96, \src1, \src2, \src3_and_dst .else @@ -274,7 +276,7 @@ vpshufd $0x13, \src, \tmp vpaddq \src, \src, \dst vpsrad $31, \tmp, \tmp -.if USE_AVX10 +.if USE_AVX512 vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst .else vpand GF_POLY_XMM, \tmp, \tmp @@ -303,52 +305,75 @@ // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. .macro _compute_first_set_of_tweaks - vmovdqu (TWEAK), TWEAK0_XMM - _vbroadcast128 .Lgf_poly(%rip), GF_POLY .if VL == 16 - // With VL=16, multiplying by x serially is fastest. + vmovdqu (TWEAK), TWEAK0_XMM + vmovdqu .Lgf_poly(%rip), GF_POLY _next_tweak TWEAK0, %xmm0, TWEAK1 _next_tweak TWEAK1, %xmm0, TWEAK2 _next_tweak TWEAK2, %xmm0, TWEAK3 -.else -.if VL == 32 - // Compute the second block of TWEAK0. +.elseif VL == 32 + vmovdqu (TWEAK), TWEAK0_XMM + vbroadcasti128 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks. _next_tweak TWEAK0_XMM, %xmm0, %xmm1 vinserti128 $1, %xmm1, TWEAK0, TWEAK0 -.elseif VL == 64 - // Compute the remaining blocks of TWEAK0. - _next_tweak TWEAK0_XMM, %xmm0, %xmm1 - _next_tweak %xmm1, %xmm0, %xmm2 - _next_tweak %xmm2, %xmm0, %xmm3 - vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0 - vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0 - vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0 -.endif - // Compute TWEAK[1-3] from TWEAK0. - vpsrlq $64 - 1*VL/16, TWEAK0, V0 - vpsrlq $64 - 2*VL/16, TWEAK0, V2 - vpsrlq $64 - 3*VL/16, TWEAK0, V4 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^2, x^2] + // TWEAK2 = TWEAK0 * [x^4, x^4] + // TWEAK3 = TWEAK0 * [x^6, x^6] + vpsrlq $64 - 2, TWEAK0, V0 + vpsrlq $64 - 4, TWEAK0, V2 + vpsrlq $64 - 6, TWEAK0, V4 vpclmulqdq $0x01, GF_POLY, V0, V1 vpclmulqdq $0x01, GF_POLY, V2, V3 vpclmulqdq $0x01, GF_POLY, V4, V5 vpslldq $8, V0, V0 vpslldq $8, V2, V2 vpslldq $8, V4, V4 - vpsllq $1*VL/16, TWEAK0, TWEAK1 - vpsllq $2*VL/16, TWEAK0, TWEAK2 - vpsllq $3*VL/16, TWEAK0, TWEAK3 -.if USE_AVX10 - vpternlogd $0x96, V0, V1, TWEAK1 - vpternlogd $0x96, V2, V3, TWEAK2 - vpternlogd $0x96, V4, V5, TWEAK3 -.else + vpsllq $2, TWEAK0, TWEAK1 + vpsllq $4, TWEAK0, TWEAK2 + vpsllq $6, TWEAK0, TWEAK3 vpxor V0, TWEAK1, TWEAK1 vpxor V2, TWEAK2, TWEAK2 vpxor V4, TWEAK3, TWEAK3 vpxor V1, TWEAK1, TWEAK1 vpxor V3, TWEAK2, TWEAK2 vpxor V5, TWEAK3, TWEAK3 -.endif +.else + vbroadcasti32x4 (TWEAK), TWEAK0 + vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks: + // TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3] + vpmovzxbq .Lrshift_amounts(%rip), V4 + vpsrlvq V4, TWEAK0, V0 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpmovzxbq .Llshift_amounts(%rip), V4 + vpslldq $8, V0, V0 + vpsllvq V4, TWEAK0, TWEAK0 + vpternlogd $0x96, V0, V1, TWEAK0 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4] + // TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8] + // TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12] + // x^8 only needs byte-aligned shifts, so optimize accordingly. + vpsrlq $64 - 4, TWEAK0, V0 + vpsrldq $(64 - 8) / 8, TWEAK0, V2 + vpsrlq $64 - 12, TWEAK0, V4 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpclmulqdq $0x01, GF_POLY, V2, V3 + vpclmulqdq $0x01, GF_POLY, V4, V5 + vpslldq $8, V0, V0 + vpslldq $8, V4, V4 + vpsllq $4, TWEAK0, TWEAK1 + vpslldq $8 / 8, TWEAK0, TWEAK2 + vpsllq $12, TWEAK0, TWEAK3 + vpternlogd $0x96, V0, V1, TWEAK1 + vpxord V3, TWEAK2, TWEAK2 + vpternlogd $0x96, V4, V5, TWEAK3 .endif .endm @@ -474,26 +499,26 @@ lea OFFS-16(KEY, KEYLEN64, 4), KEY // If all 32 SIMD registers are available, cache all the round keys. -.if USE_AVX10 +.if USE_AVX512 cmp $24, KEYLEN jl .Laes128\@ je .Laes192\@ - _vbroadcast128 -6*16(KEY), KEY1 - _vbroadcast128 -5*16(KEY), KEY2 + vbroadcasti32x4 -6*16(KEY), KEY1 + vbroadcasti32x4 -5*16(KEY), KEY2 .Laes192\@: - _vbroadcast128 -4*16(KEY), KEY3 - _vbroadcast128 -3*16(KEY), KEY4 + vbroadcasti32x4 -4*16(KEY), KEY3 + vbroadcasti32x4 -3*16(KEY), KEY4 .Laes128\@: - _vbroadcast128 -2*16(KEY), KEY5 - _vbroadcast128 -1*16(KEY), KEY6 - _vbroadcast128 0*16(KEY), KEY7 - _vbroadcast128 1*16(KEY), KEY8 - _vbroadcast128 2*16(KEY), KEY9 - _vbroadcast128 3*16(KEY), KEY10 - _vbroadcast128 4*16(KEY), KEY11 - _vbroadcast128 5*16(KEY), KEY12 - _vbroadcast128 6*16(KEY), KEY13 - _vbroadcast128 7*16(KEY), KEY14 + vbroadcasti32x4 -2*16(KEY), KEY5 + vbroadcasti32x4 -1*16(KEY), KEY6 + vbroadcasti32x4 0*16(KEY), KEY7 + vbroadcasti32x4 1*16(KEY), KEY8 + vbroadcasti32x4 2*16(KEY), KEY9 + vbroadcasti32x4 3*16(KEY), KEY10 + vbroadcasti32x4 4*16(KEY), KEY11 + vbroadcasti32x4 5*16(KEY), KEY12 + vbroadcasti32x4 6*16(KEY), KEY13 + vbroadcasti32x4 7*16(KEY), KEY14 .endif .endm @@ -521,7 +546,7 @@ // using the same key for all block(s). The round key is loaded from the // appropriate register or memory location for round \i. May clobber \tmp. .macro _vaes_1x enc, i, xmm_suffix, data, tmp -.if USE_AVX10 +.if USE_AVX512 _vaes \enc, KEY\i\xmm_suffix, \data .else .ifnb \xmm_suffix @@ -538,7 +563,7 @@ // appropriate register or memory location for round \i. In addition, does two // steps of the computation of the next set of tweaks. May clobber V4 and V5. .macro _vaes_4x enc, i -.if USE_AVX10 +.if USE_AVX512 _tweak_step (2*(\i-5)) _vaes \enc, KEY\i, V0 _vaes \enc, KEY\i, V1 @@ -574,7 +599,7 @@ .irp i, 5,6,7,8,9,10,11,12,13 _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp .endr -.if USE_AVX10 +.if USE_AVX512 vpxord KEY14\xmm_suffix, \tweak, \tmp .else .ifnb \xmm_suffix @@ -617,11 +642,11 @@ // This is the main loop, en/decrypting 4*VL bytes per iteration. // XOR each source block with its tweak and the zero-th round key. -.if USE_AVX10 - _vmovdqu 0*VL(SRC), V0 - _vmovdqu 1*VL(SRC), V1 - _vmovdqu 2*VL(SRC), V2 - _vmovdqu 3*VL(SRC), V3 +.if USE_AVX512 + vmovdqu8 0*VL(SRC), V0 + vmovdqu8 1*VL(SRC), V1 + vmovdqu8 2*VL(SRC), V2 + vmovdqu8 3*VL(SRC), V3 vpternlogd $0x96, TWEAK0, KEY0, V0 vpternlogd $0x96, TWEAK1, KEY0, V1 vpternlogd $0x96, TWEAK2, KEY0, V2 @@ -654,7 +679,7 @@ // Reduce latency by doing the XOR before the vaesenclast, utilizing the // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) // (and likewise for vaesdeclast). -.if USE_AVX10 +.if USE_AVX512 _tweak_step 18 _tweak_step 19 vpxord TWEAK0, KEY14, V4 @@ -762,7 +787,7 @@ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 .endif -.if USE_AVX10 +.if USE_AVX512 // Create a mask that has the first LEN bits set. mov $-1, %r9d bzhi LEN, %r9d, %r9d @@ -811,7 +836,7 @@ // u8 iv[AES_BLOCK_SIZE]); // // Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes -// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512. SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) .set TWEAK_KEY, %rdi .set IV, %rsi @@ -853,7 +878,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv) // multiple of 16, then this function updates |tweak| to contain the next tweak. .set VL, 16 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) _aes_xts_crypt 1 SYM_FUNC_END(aes_xts_encrypt_aesni_avx) @@ -863,7 +888,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx) #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) .set VL, 32 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) _aes_xts_crypt 1 SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) @@ -871,21 +896,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) _aes_xts_crypt 0 SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) -.set VL, 32 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) - _aes_xts_crypt 1 -SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) -SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) - _aes_xts_crypt 0 -SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) - .set VL, 64 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512) +.set USE_AVX512, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512) _aes_xts_crypt 1 -SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512) -SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512) +SYM_FUNC_END(aes_xts_encrypt_vaes_avx512) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512) _aes_xts_crypt 0 -SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512) +SYM_FUNC_END(aes_xts_decrypt_vaes_avx512) #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index bc655d794a95..061b1ced93c5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -566,10 +566,9 @@ static struct crypto_alg aesni_cipher_alg = { static struct skcipher_alg aesni_skciphers[] = { { .base = { - .cra_name = "__ecb(aes)", - .cra_driver_name = "__ecb-aes-aesni", + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -581,10 +580,9 @@ static struct skcipher_alg aesni_skciphers[] = { .decrypt = ecb_decrypt, }, { .base = { - .cra_name = "__cbc(aes)", - .cra_driver_name = "__cbc-aes-aesni", + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -597,10 +595,9 @@ static struct skcipher_alg aesni_skciphers[] = { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__cts(cbc(aes))", - .cra_driver_name = "__cts-cbc-aes-aesni", + .cra_name = "cts(cbc(aes))", + .cra_driver_name = "cts-cbc-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -615,10 +612,9 @@ static struct skcipher_alg aesni_skciphers[] = { #ifdef CONFIG_X86_64 }, { .base = { - .cra_name = "__ctr(aes)", - .cra_driver_name = "__ctr-aes-aesni", + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -633,10 +629,9 @@ static struct skcipher_alg aesni_skciphers[] = { #endif }, { .base = { - .cra_name = "__xts(aes)", - .cra_driver_name = "__xts-aes-aesni", + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-aesni", .cra_priority = 401, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = XTS_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -651,9 +646,6 @@ static struct skcipher_alg aesni_skciphers[] = { } }; -static -struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; - #ifdef CONFIG_X86_64 asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]); @@ -792,10 +784,9 @@ static int xctr_crypt_##suffix(struct skcipher_request *req) \ } \ \ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ - .base.cra_name = "__xts(aes)", \ - .base.cra_driver_name = "__xts-aes-" driver_name_suffix, \ + .base.cra_name = "xts(aes)", \ + .base.cra_driver_name = "xts-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ - .base.cra_flags = CRYPTO_ALG_INTERNAL, \ .base.cra_blocksize = AES_BLOCK_SIZE, \ .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ @@ -807,10 +798,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ .encrypt = xts_encrypt_##suffix, \ .decrypt = xts_decrypt_##suffix, \ }, { \ - .base.cra_name = "__ctr(aes)", \ - .base.cra_driver_name = "__ctr-aes-" driver_name_suffix, \ + .base.cra_name = "ctr(aes)", \ + .base.cra_driver_name = "ctr-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ - .base.cra_flags = CRYPTO_ALG_INTERNAL, \ .base.cra_blocksize = 1, \ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ @@ -822,10 +812,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ .encrypt = ctr_crypt_##suffix, \ .decrypt = ctr_crypt_##suffix, \ }, { \ - .base.cra_name = "__xctr(aes)", \ - .base.cra_driver_name = "__xctr-aes-" driver_name_suffix, \ + .base.cra_name = "xctr(aes)", \ + .base.cra_driver_name = "xctr-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ - .base.cra_flags = CRYPTO_ALG_INTERNAL, \ .base.cra_blocksize = 1, \ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ @@ -836,16 +825,12 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ .setkey = aesni_skcipher_setkey, \ .encrypt = xctr_crypt_##suffix, \ .decrypt = xctr_crypt_##suffix, \ -}}; \ - \ -static struct simd_skcipher_alg * \ -simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)] +}} DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); -DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700); -DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800); #endif /* The common part of the x86_64 AES-GCM key struct */ @@ -1499,10 +1484,9 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { { \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ - .cra_name = "__gcm(aes)", \ - .cra_driver_name = "__" generic_driver_name, \ + .cra_name = "gcm(aes)", \ + .cra_driver_name = generic_driver_name, \ .cra_priority = (priority), \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ @@ -1516,17 +1500,14 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { { \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ - .cra_name = "__rfc4106(gcm(aes))", \ - .cra_driver_name = "__" rfc_driver_name, \ + .cra_name = "rfc4106(gcm(aes))", \ + .cra_driver_name = rfc_driver_name, \ .cra_priority = (priority), \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ }, \ -} }; \ - \ -static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ +} } /* aes_gcm_algs_aesni */ DEFINE_GCM_ALGS(aesni, /* no flags */ 0, @@ -1556,14 +1537,12 @@ static int __init register_avx_algs(void) if (!boot_cpu_has(X86_FEATURE_AVX)) return 0; - err = simd_register_skciphers_compat(skcipher_algs_aesni_avx, - ARRAY_SIZE(skcipher_algs_aesni_avx), - simd_skcipher_algs_aesni_avx); + err = crypto_register_skciphers(skcipher_algs_aesni_avx, + ARRAY_SIZE(skcipher_algs_aesni_avx)); if (err) return err; - err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, - ARRAY_SIZE(aes_gcm_algs_aesni_avx), - aes_gcm_simdalgs_aesni_avx); + err = crypto_register_aeads(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx)); if (err) return err; /* @@ -1579,9 +1558,8 @@ static int __init register_avx_algs(void) !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return 0; - err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2, - ARRAY_SIZE(skcipher_algs_vaes_avx2), - simd_skcipher_algs_vaes_avx2); + err = crypto_register_skciphers(skcipher_algs_vaes_avx2, + ARRAY_SIZE(skcipher_algs_vaes_avx2)); if (err) return err; @@ -1592,76 +1570,52 @@ static int __init register_avx_algs(void) XFEATURE_MASK_AVX512, NULL)) return 0; - err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256, - ARRAY_SIZE(skcipher_algs_vaes_avx10_256), - simd_skcipher_algs_vaes_avx10_256); - if (err) - return err; - err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), - aes_gcm_simdalgs_vaes_avx10_256); + err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); if (err) return err; if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { int i; - for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++) - skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) + skcipher_algs_vaes_avx512[i].base.cra_priority = 1; for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; } - err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512, - ARRAY_SIZE(skcipher_algs_vaes_avx10_512), - simd_skcipher_algs_vaes_avx10_512); + err = crypto_register_skciphers(skcipher_algs_vaes_avx512, + ARRAY_SIZE(skcipher_algs_vaes_avx512)); if (err) return err; - err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), - aes_gcm_simdalgs_vaes_avx10_512); + err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); if (err) return err; #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ return 0; } +#define unregister_skciphers(A) \ + if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ + crypto_unregister_skciphers((A), ARRAY_SIZE(A)) +#define unregister_aeads(A) \ + if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ + crypto_unregister_aeads((A), ARRAY_SIZE(A)) + static void unregister_avx_algs(void) { - if (simd_skcipher_algs_aesni_avx[0]) - simd_unregister_skciphers(skcipher_algs_aesni_avx, - ARRAY_SIZE(skcipher_algs_aesni_avx), - simd_skcipher_algs_aesni_avx); - if (aes_gcm_simdalgs_aesni_avx[0]) - simd_unregister_aeads(aes_gcm_algs_aesni_avx, - ARRAY_SIZE(aes_gcm_algs_aesni_avx), - aes_gcm_simdalgs_aesni_avx); + unregister_skciphers(skcipher_algs_aesni_avx); + unregister_aeads(aes_gcm_algs_aesni_avx); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) - if (simd_skcipher_algs_vaes_avx2[0]) - simd_unregister_skciphers(skcipher_algs_vaes_avx2, - ARRAY_SIZE(skcipher_algs_vaes_avx2), - simd_skcipher_algs_vaes_avx2); - if (simd_skcipher_algs_vaes_avx10_256[0]) - simd_unregister_skciphers(skcipher_algs_vaes_avx10_256, - ARRAY_SIZE(skcipher_algs_vaes_avx10_256), - simd_skcipher_algs_vaes_avx10_256); - if (aes_gcm_simdalgs_vaes_avx10_256[0]) - simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), - aes_gcm_simdalgs_vaes_avx10_256); - if (simd_skcipher_algs_vaes_avx10_512[0]) - simd_unregister_skciphers(skcipher_algs_vaes_avx10_512, - ARRAY_SIZE(skcipher_algs_vaes_avx10_512), - simd_skcipher_algs_vaes_avx10_512); - if (aes_gcm_simdalgs_vaes_avx10_512[0]) - simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), - aes_gcm_simdalgs_vaes_avx10_512); + unregister_skciphers(skcipher_algs_vaes_avx2); + unregister_skciphers(skcipher_algs_vaes_avx512); + unregister_aeads(aes_gcm_algs_vaes_avx10_256); + unregister_aeads(aes_gcm_algs_vaes_avx10_512); #endif } #else /* CONFIG_X86_64 */ static struct aead_alg aes_gcm_algs_aesni[0]; -static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0]; static int __init register_avx_algs(void) { @@ -1690,15 +1644,13 @@ static int __init aesni_init(void) if (err) return err; - err = simd_register_skciphers_compat(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + err = crypto_register_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); if (err) goto unregister_cipher; - err = simd_register_aeads_compat(aes_gcm_algs_aesni, - ARRAY_SIZE(aes_gcm_algs_aesni), - aes_gcm_simdalgs_aesni); + err = crypto_register_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); if (err) goto unregister_skciphers; @@ -1710,12 +1662,11 @@ static int __init aesni_init(void) unregister_avx: unregister_avx_algs(); - simd_unregister_aeads(aes_gcm_algs_aesni, - ARRAY_SIZE(aes_gcm_algs_aesni), - aes_gcm_simdalgs_aesni); + crypto_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); unregister_skciphers: - simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); unregister_cipher: crypto_unregister_alg(&aesni_cipher_alg); return err; @@ -1723,11 +1674,10 @@ unregister_cipher: static void __exit aesni_exit(void) { - simd_unregister_aeads(aes_gcm_algs_aesni, - ARRAY_SIZE(aes_gcm_algs_aesni), - aes_gcm_simdalgs_aesni); - simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + crypto_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); crypto_unregister_alg(&aesni_cipher_alg); unregister_avx_algs(); } diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c index 87a11804fc77..b4bddcd58457 100644 --- a/arch/x86/crypto/aria_aesni_avx2_glue.c +++ b/arch/x86/crypto/aria_aesni_avx2_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -165,10 +164,9 @@ static int aria_avx2_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx2", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx2_ecb_encrypt, .decrypt = aria_avx2_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx2", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL | - CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx2_init(void) { const char *feature_name; @@ -233,15 +228,12 @@ static int __init aria_avx2_init(void) aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way; } - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx2_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx2_init); diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c index 4e1516b76669..ab9b38d05332 100644 --- a/arch/x86/crypto/aria_aesni_avx_glue.c +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -152,10 +151,9 @@ static int aria_avx_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -165,10 +163,9 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx_ecb_encrypt, .decrypt = aria_avx_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -184,8 +181,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx_init(void) { const char *feature_name; @@ -213,15 +208,12 @@ static int __init aria_avx_init(void) aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way; } - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx_init); diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c index f4a2208d2638..363cbf4399cc 100644 --- a/arch/x86/crypto/aria_gfni_avx512_glue.c +++ b/arch/x86/crypto/aria_gfni_avx512_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -165,10 +164,9 @@ static int aria_avx512_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx512", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx512", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx512_ecb_encrypt, .decrypt = aria_avx512_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx512", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx512", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL | - CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx512_init(void) { const char *feature_name; @@ -229,15 +224,12 @@ static int __init aria_avx512_init(void) aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way; aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way; - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx512_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx512_init); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index e7e4d64e9577..2d2f4e16537c 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg camellia_algs[] = { { - .base.cra_name = "__ecb(camellia)", - .base.cra_driver_name = "__ecb-camellia-aesni-avx2", + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-aesni-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(camellia)", - .base.cra_driver_name = "__cbc-camellia-aesni-avx2", + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-aesni-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = { }, }; -static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; - static int __init camellia_aesni_init(void) { const char *feature_name; @@ -118,15 +113,13 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return simd_register_skciphers_compat(camellia_algs, - ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + return crypto_register_skciphers(camellia_algs, + ARRAY_SIZE(camellia_algs)); } static void __exit camellia_aesni_fini(void) { - simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs)); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index c7ccf63e741e..a7d162388142 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg camellia_algs[] = { { - .base.cra_name = "__ecb(camellia)", - .base.cra_driver_name = "__ecb-camellia-aesni", + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-aesni", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(camellia)", - .base.cra_driver_name = "__cbc-camellia-aesni", + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-aesni", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = { } }; -static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; - static int __init camellia_aesni_init(void) { const char *feature_name; @@ -117,15 +112,13 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return simd_register_skciphers_compat(camellia_algs, - ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + return crypto_register_skciphers(camellia_algs, + ARRAY_SIZE(camellia_algs)); } static void __exit camellia_aesni_fini(void) { - simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs)); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 3976a87f92ad..3aca04d43b34 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -8,7 +8,6 @@ #include <crypto/algapi.h> #include <crypto/cast5.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg cast5_algs[] = { { - .base.cra_name = "__ecb(cast5)", - .base.cra_driver_name = "__ecb-cast5-avx", + .base.cra_name = "ecb(cast5)", + .base.cra_driver_name = "ecb-cast5-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, @@ -77,10 +75,9 @@ static struct skcipher_alg cast5_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(cast5)", - .base.cra_driver_name = "__cbc-cast5-avx", + .base.cra_name = "cbc(cast5)", + .base.cra_driver_name = "cbc-cast5-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, @@ -93,8 +90,6 @@ static struct skcipher_alg cast5_algs[] = { } }; -static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)]; - static int __init cast5_init(void) { const char *feature_name; @@ -105,15 +100,13 @@ static int __init cast5_init(void) return -ENODEV; } - return simd_register_skciphers_compat(cast5_algs, - ARRAY_SIZE(cast5_algs), - cast5_simd_algs); + return crypto_register_skciphers(cast5_algs, + ARRAY_SIZE(cast5_algs)); } static void __exit cast5_exit(void) { - simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs), - cast5_simd_algs); + crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs)); } module_init(cast5_init); diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 7e2aea372349..c4dd28c30303 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -14,7 +14,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/cast6.h> -#include <crypto/internal/simd.h> #include "ecb_cbc_helpers.h" @@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg cast6_algs[] = { { - .base.cra_name = "__ecb(cast6)", - .base.cra_driver_name = "__ecb-cast6-avx", + .base.cra_name = "ecb(cast6)", + .base.cra_driver_name = "ecb-cast6-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, @@ -77,10 +75,9 @@ static struct skcipher_alg cast6_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(cast6)", - .base.cra_driver_name = "__cbc-cast6-avx", + .base.cra_name = "cbc(cast6)", + .base.cra_driver_name = "cbc-cast6-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, @@ -93,8 +90,6 @@ static struct skcipher_alg cast6_algs[] = { }, }; -static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)]; - static int __init cast6_init(void) { const char *feature_name; @@ -105,15 +100,12 @@ static int __init cast6_init(void) return -ENODEV; } - return simd_register_skciphers_compat(cast6_algs, - ARRAY_SIZE(cast6_algs), - cast6_simd_algs); + return crypto_register_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs)); } static void __exit cast6_exit(void) { - simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs), - cast6_simd_algs); + crypto_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs)); } module_init(cast6_init); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c deleted file mode 100644 index 8bb74a272879..000000000000 --- a/arch/x86/crypto/chacha_glue.c +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright (C) 2015 Martin Willi - */ - -#include <crypto/algapi.h> -#include <crypto/internal/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); - -asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); - -static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (IS_ENABLED(CONFIG_AS_AVX512) && - static_branch_likely(&chacha_use_avx512vl)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_2block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 2); - return; - } - } - - if (static_branch_likely(&chacha_use_avx2)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 2); - return; - } - } - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); - state[12]++; - } -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { - hchacha_block_generic(state, stream, nrounds); - } else { - kernel_fpu_begin(); - hchacha_block_ssse3(state, stream, nrounds); - kernel_fpu_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, - int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || - bytes <= CHACHA_BLOCK_SIZE) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, todo, nrounds); - kernel_fpu_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int chacha_simd_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv) -{ - u32 state[CHACHA_STATE_WORDS] __aligned(8); - struct skcipher_walk walk; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - chacha_init(state, ctx->key, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - if (!static_branch_likely(&chacha_use_simd) || - !crypto_simd_usable()) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - kernel_fpu_begin(); - chacha_dosimd(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - kernel_fpu_end(); - } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_simd_stream_xor(req, ctx, req->iv); -} - -static int xchacha_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 state[CHACHA_STATE_WORDS] __aligned(8); - struct chacha_ctx subctx; - u8 real_iv[16]; - - chacha_init(state, ctx->key, req->iv); - - if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { - kernel_fpu_begin(); - hchacha_block_ssse3(state, subctx.key, ctx->nrounds); - kernel_fpu_end(); - } else { - hchacha_block_generic(state, subctx.key, ctx->nrounds); - } - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_simd_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_simd, - .decrypt = chacha_simd, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_simd, - .decrypt = xchacha_simd, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = xchacha_simd, - .decrypt = xchacha_simd, - }, -}; - -static int __init chacha_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&chacha_use_simd); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - static_branch_enable(&chacha_use_avx2); - - if (IS_ENABLED(CONFIG_AS_AVX512) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ - static_branch_enable(&chacha_use_avx512vl); - } - return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? - crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; -} - -static void __exit chacha_simd_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha_simd_mod_init); -module_exit(chacha_simd_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-simd"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 99cb983ded9e..c4fbaa82ed7a 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -103,8 +103,8 @@ SYM_FUNC_START(clmul_ghash_mul) SYM_FUNC_END(clmul_ghash_mul) /* - * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - * const le128 *shash); + * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + * const le128 *shash); */ SYM_FUNC_START(clmul_ghash_update) FRAME_BEGIN @@ -127,6 +127,7 @@ SYM_FUNC_START(clmul_ghash_update) pshufb BSWAP, DATA movups DATA, (%rdi) .Lupdate_just_ret: + mov %rdx, %rax FRAME_END RET SYM_FUNC_END(clmul_ghash_update) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index c759ec808bf1..aea5d4d06be7 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -7,41 +7,27 @@ * Author: Huang Ying <ying.huang@intel.com> */ -#include <linux/err.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/crypto.h> -#include <crypto/algapi.h> -#include <crypto/cryptd.h> -#include <crypto/gf128mul.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> +#include <crypto/b128ops.h> +#include <crypto/ghash.h> +#include <crypto/internal/hash.h> +#include <crypto/utils.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #include <linux/unaligned.h> -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 +asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash); -void clmul_ghash_mul(char *dst, const le128 *shash); +asmlinkage int clmul_ghash_update(char *dst, const char *src, + unsigned int srclen, const le128 *shash); -void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - const le128 *shash); - -struct ghash_async_ctx { - struct cryptd_ahash *cryptd_tfm; -}; - -struct ghash_ctx { +struct x86_ghash_ctx { le128 shash; }; -struct ghash_desc_ctx { - u8 buffer[GHASH_BLOCK_SIZE]; - u32 bytes; -}; - static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); @@ -54,7 +40,7 @@ static int ghash_init(struct shash_desc *desc) static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { - struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm); u64 a, b; if (keylen != GHASH_BLOCK_SIZE) @@ -95,64 +81,38 @@ static int ghash_setkey(struct crypto_shash *tfm, static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { + struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *dst = dctx->buffer; + int remain; kernel_fpu_begin(); - if (dctx->bytes) { - int n = min(srclen, dctx->bytes); - u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); - - dctx->bytes -= n; - srclen -= n; - - while (n--) - *pos++ ^= *src++; - - if (!dctx->bytes) - clmul_ghash_mul(dst, &ctx->shash); - } - - clmul_ghash_update(dst, src, srclen, &ctx->shash); + remain = clmul_ghash_update(dst, src, srclen, &ctx->shash); kernel_fpu_end(); - - if (srclen & 0xf) { - src += srclen - (srclen & 0xf); - srclen &= 0xf; - dctx->bytes = GHASH_BLOCK_SIZE - srclen; - while (srclen--) - *dst++ ^= *src++; - } - - return 0; + return remain; } -static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx, + const u8 *src, unsigned int len) { u8 *dst = dctx->buffer; - if (dctx->bytes) { - u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); - - while (dctx->bytes--) - *tmp++ ^= 0; - - kernel_fpu_begin(); + kernel_fpu_begin(); + if (len) { + crypto_xor(dst, src, len); clmul_ghash_mul(dst, &ctx->shash); - kernel_fpu_end(); } - - dctx->bytes = 0; + kernel_fpu_end(); } -static int ghash_final(struct shash_desc *desc, u8 *dst) +static int ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { + struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *buf = dctx->buffer; - ghash_flush(ctx, dctx); + ghash_flush(ctx, dctx, src, len); memcpy(dst, buf, GHASH_BLOCK_SIZE); return 0; @@ -162,186 +122,20 @@ static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, - .final = ghash_final, + .finup = ghash_finup, .setkey = ghash_setkey, .descsize = sizeof(struct ghash_desc_ctx), .base = { - .cra_name = "__ghash", - .cra_driver_name = "__ghash-pclmulqdqni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_name = "ghash", + .cra_driver_name = "ghash-pclmulqdqni", + .cra_priority = 400, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_ctxsize = sizeof(struct x86_ghash_ctx), .cra_module = THIS_MODULE, }, }; -static int ghash_async_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - return crypto_shash_init(desc); -} - -static void ghash_init_cryptd_req(struct ahash_request *req) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - ahash_request_set_callback(cryptd_req, req->base.flags, - req->base.complete, req->base.data); - ahash_request_set_crypt(cryptd_req, req->src, req->result, - req->nbytes); -} - -static int ghash_async_update(struct ahash_request *req) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - ghash_init_cryptd_req(req); - return crypto_ahash_update(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - return shash_ahash_update(req, desc); - } -} - -static int ghash_async_final(struct ahash_request *req) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - ghash_init_cryptd_req(req); - return crypto_ahash_final(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - return crypto_shash_final(desc, req->result); - } -} - -static int ghash_async_import(struct ahash_request *req, const void *in) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - - ghash_async_init(req); - memcpy(dctx, in, sizeof(*dctx)); - return 0; - -} - -static int ghash_async_export(struct ahash_request *req, void *out) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - - memcpy(out, dctx, sizeof(*dctx)); - return 0; - -} - -static int ghash_async_digest(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - ghash_init_cryptd_req(req); - return crypto_ahash_digest(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - return shash_ahash_digest(req, desc); - } -} - -static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, - unsigned int keylen) -{ - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct crypto_ahash *child = &ctx->cryptd_tfm->base; - - crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - return crypto_ahash_setkey(child, key, keylen); -} - -static int ghash_async_init_tfm(struct crypto_tfm *tfm) -{ - struct cryptd_ahash *cryptd_tfm; - struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ctx->cryptd_tfm = cryptd_tfm; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct ahash_request) + - crypto_ahash_reqsize(&cryptd_tfm->base)); - - return 0; -} - -static void ghash_async_exit_tfm(struct crypto_tfm *tfm) -{ - struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ahash(ctx->cryptd_tfm); -} - -static struct ahash_alg ghash_async_alg = { - .init = ghash_async_init, - .update = ghash_async_update, - .final = ghash_async_final, - .setkey = ghash_async_setkey, - .digest = ghash_async_digest, - .export = ghash_async_export, - .import = ghash_async_import, - .halg = { - .digestsize = GHASH_DIGEST_SIZE, - .statesize = sizeof(struct ghash_desc_ctx), - .base = { - .cra_name = "ghash", - .cra_driver_name = "ghash-clmulni", - .cra_priority = 400, - .cra_ctxsize = sizeof(struct ghash_async_ctx), - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_module = THIS_MODULE, - .cra_init = ghash_async_init_tfm, - .cra_exit = ghash_async_exit_tfm, - }, - }, -}; - static const struct x86_cpu_id pcmul_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */ {} @@ -350,29 +144,14 @@ MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); static int __init ghash_pclmulqdqni_mod_init(void) { - int err; - if (!x86_match_cpu(pcmul_cpu_id)) return -ENODEV; - err = crypto_register_shash(&ghash_alg); - if (err) - goto err_out; - err = crypto_register_ahash(&ghash_async_alg); - if (err) - goto err_shash; - - return 0; - -err_shash: - crypto_unregister_shash(&ghash_alg); -err_out: - return err; + return crypto_register_shash(&ghash_alg); } static void __exit ghash_pclmulqdqni_mod_exit(void) { - crypto_unregister_ahash(&ghash_async_alg); crypto_unregister_shash(&ghash_alg); } diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c deleted file mode 100644 index 08ff4b489f7e..000000000000 --- a/arch/x86/crypto/poly1305_glue.c +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <crypto/algapi.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/poly1305.h> -#include <crypto/internal/simd.h> -#include <linux/crypto.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -asmlinkage void poly1305_init_x86_64(void *ctx, - const u8 key[POLY1305_BLOCK_SIZE]); -asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); -asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); - -struct poly1305_arch_internal { - union { - struct { - u32 h[5]; - u32 is_base2_26; - }; - u64 hs[3]; - }; - u64 r[2]; - u64 pad; - struct { u32 r2, r1, r4, r3; } rn[9]; -}; - -/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit - * the unfortunate situation of using AVX and then having to go back to scalar - * -- because the user is silly and has called the update function from two - * separate contexts -- then we need to convert back to the original base before - * proceeding. It is possible to reason that the initial reduction below is - * sufficient given the implementation invariants. However, for an avoidance of - * doubt and because this is not performance critical, we do the full reduction - * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py - */ -static void convert_to_base2_64(void *ctx) -{ - struct poly1305_arch_internal *state = ctx; - u32 cy; - - if (!state->is_base2_26) - return; - - cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; - cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; - cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; - cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; - state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; - state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); - state->hs[2] = state->h[4] >> 24; -#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) - cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); - state->hs[2] &= 3; - state->hs[0] += cy; - state->hs[1] += (cy = ULT(state->hs[0], cy)); - state->hs[2] += ULT(state->hs[1], cy); -#undef ULT - state->is_base2_26 = 0; -} - -static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) -{ - poly1305_init_x86_64(ctx, key); -} - -static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, - const u32 padbit) -{ - struct poly1305_arch_internal *state = ctx; - - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); - - if (!static_branch_likely(&poly1305_use_avx) || - (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || - !crypto_simd_usable()) { - convert_to_base2_64(ctx); - poly1305_blocks_x86_64(ctx, inp, len, padbit); - return; - } - - do { - const size_t bytes = min_t(size_t, len, SZ_4K); - - kernel_fpu_begin(); - if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) - poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (static_branch_likely(&poly1305_use_avx2)) - poly1305_blocks_avx2(ctx, inp, bytes, padbit); - else - poly1305_blocks_avx(ctx, inp, bytes, padbit); - kernel_fpu_end(); - - len -= bytes; - inp += bytes; - } while (len); -} - -static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]) -{ - if (!static_branch_likely(&poly1305_use_avx)) - poly1305_emit_x86_64(ctx, mac, nonce); - else - poly1305_emit_avx(ctx, mac, nonce); -} - -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) -{ - poly1305_simd_init(&dctx->h, key); - dctx->s[0] = get_unaligned_le32(&key[16]); - dctx->s[1] = get_unaligned_le32(&key[20]); - dctx->s[2] = get_unaligned_le32(&key[24]); - dctx->s[3] = get_unaligned_le32(&key[28]); - dctx->buflen = 0; - dctx->sset = true; -} -EXPORT_SYMBOL(poly1305_init_arch); - -static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, - const u8 *inp, unsigned int len) -{ - unsigned int acc = 0; - if (unlikely(!dctx->sset)) { - if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { - poly1305_simd_init(&dctx->h, inp); - inp += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - acc += POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (len >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(&inp[0]); - dctx->s[1] = get_unaligned_le32(&inp[4]); - dctx->s[2] = get_unaligned_le32(&inp[8]); - dctx->s[3] = get_unaligned_le32(&inp[12]); - acc += POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - } - return acc; -} - -void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, - unsigned int srclen) -{ - unsigned int bytes, used; - - if (unlikely(dctx->buflen)) { - bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - srclen -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) - poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); - dctx->buflen = 0; - } - } - - if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = round_down(srclen, POLY1305_BLOCK_SIZE); - srclen -= bytes; - used = crypto_poly1305_setdctxkey(dctx, src, bytes); - if (likely(bytes - used)) - poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); - src += bytes; - } - - if (unlikely(srclen)) { - dctx->buflen = srclen; - memcpy(dctx->buf, src, srclen); - } -} -EXPORT_SYMBOL(poly1305_update_arch); - -void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) -{ - if (unlikely(dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); - } - - poly1305_simd_emit(&dctx->h, dst, dctx->s); - memzero_explicit(dctx, sizeof(*dctx)); -} -EXPORT_SYMBOL(poly1305_final_arch); - -static int crypto_poly1305_init(struct shash_desc *desc) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - *dctx = (struct poly1305_desc_ctx){}; - return 0; -} - -static int crypto_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - poly1305_update_arch(dctx, src, srclen); - return 0; -} - -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); - return 0; -} - -static struct shash_alg alg = { - .digestsize = POLY1305_DIGEST_SIZE, - .init = crypto_poly1305_init, - .update = crypto_poly1305_update, - .final = crypto_poly1305_final, - .descsize = sizeof(struct poly1305_desc_ctx), - .base = { - .cra_name = "poly1305", - .cra_driver_name = "poly1305-simd", - .cra_priority = 300, - .cra_blocksize = POLY1305_BLOCK_SIZE, - .cra_module = THIS_MODULE, - }, -}; - -static int __init poly1305_simd_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx2); - if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) - static_branch_enable(&poly1305_use_avx512); - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; -} - -static void __exit poly1305_simd_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) - crypto_unregister_shash(&alg); -} - -module_init(poly1305_simd_mod_init); -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); -MODULE_DESCRIPTION("Poly1305 authenticator"); -MODULE_ALIAS_CRYPTO("poly1305"); -MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c index 8fa58b0f3cb3..6b466867f91a 100644 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -16,16 +16,15 @@ * operations. */ -#include <crypto/algapi.h> +#include <asm/cpu_device_id.h> +#include <asm/fpu/api.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/polyval.h> -#include <linux/crypto.h> -#include <linux/init.h> +#include <crypto/utils.h> +#include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> +#include <linux/string.h> #define POLYVAL_ALIGN 16 #define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) @@ -42,7 +41,6 @@ struct polyval_tfm_ctx { struct polyval_desc_ctx { u8 buffer[POLYVAL_BLOCK_SIZE]; - u32 bytes; }; asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, @@ -57,25 +55,16 @@ static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) static void internal_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator) { - if (likely(crypto_simd_usable())) { - kernel_fpu_begin(); - clmul_polyval_update(keys, in, nblocks, accumulator); - kernel_fpu_end(); - } else { - polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, - nblocks, accumulator); - } + kernel_fpu_begin(); + clmul_polyval_update(keys, in, nblocks, accumulator); + kernel_fpu_end(); } static void internal_polyval_mul(u8 *op1, const u8 *op2) { - if (likely(crypto_simd_usable())) { - kernel_fpu_begin(); - clmul_polyval_mul(op1, op2); - kernel_fpu_end(); - } else { - polyval_mul_non4k(op1, op2); - } + kernel_fpu_begin(); + clmul_polyval_mul(op1, op2); + kernel_fpu_end(); } static int polyval_x86_setkey(struct crypto_shash *tfm, @@ -112,49 +101,27 @@ static int polyval_x86_update(struct shash_desc *desc, { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - u8 *pos; unsigned int nblocks; - unsigned int n; - - if (dctx->bytes) { - n = min(srclen, dctx->bytes); - pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; - - dctx->bytes -= n; - srclen -= n; - - while (n--) - *pos++ ^= *src++; - if (!dctx->bytes) - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - while (srclen >= POLYVAL_BLOCK_SIZE) { + do { /* Allow rescheduling every 4K bytes. */ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; internal_polyval_update(tctx, src, nblocks, dctx->buffer); srclen -= nblocks * POLYVAL_BLOCK_SIZE; src += nblocks * POLYVAL_BLOCK_SIZE; - } + } while (srclen >= POLYVAL_BLOCK_SIZE); - if (srclen) { - dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; - pos = dctx->buffer; - while (srclen--) - *pos++ ^= *src++; - } - - return 0; + return srclen; } -static int polyval_x86_final(struct shash_desc *desc, u8 *dst) +static int polyval_x86_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - if (dctx->bytes) { + if (len) { + crypto_xor(dctx->buffer, src, len); internal_polyval_mul(dctx->buffer, tctx->key_powers[NUM_KEY_POWERS-1]); } @@ -168,13 +135,14 @@ static struct shash_alg polyval_alg = { .digestsize = POLYVAL_DIGEST_SIZE, .init = polyval_x86_init, .update = polyval_x86_update, - .final = polyval_x86_final, + .finup = polyval_x86_finup, .setkey = polyval_x86_setkey, .descsize = sizeof(struct polyval_desc_ctx), .base = { .cra_name = "polyval", .cra_driver_name = "polyval-clmulni", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = POLYVAL_BLOCK_SIZE, .cra_ctxsize = POLYVAL_CTX_SIZE, .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 347e97f4b713..f5f2121b7956 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -10,7 +10,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" @@ -65,10 +64,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-avx2", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-avx2", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -78,10 +76,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-avx2", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-avx2", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -94,8 +91,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_avx2_init(void) { const char *feature_name; @@ -110,15 +105,13 @@ static int __init serpent_avx2_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_avx2_fini(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_avx2_init); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 6c248e1ea4ef..e640abc1cb8a 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -13,7 +13,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" @@ -71,10 +70,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-avx", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-avx", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -84,10 +82,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-avx", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-avx", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -100,8 +97,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_init(void) { const char *feature_name; @@ -112,15 +107,13 @@ static int __init serpent_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_exit(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_init); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index d78f37e9b2cf..80ee17ec21b4 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -18,7 +18,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/b128ops.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-sse2.h" @@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-sse2", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-sse2", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -87,10 +85,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-sse2", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-sse2", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -103,8 +100,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_sse2_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2)) { @@ -112,15 +107,13 @@ static int __init serpent_sse2_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_sse2_exit(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_sse2_init); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index ab8bc54f254d..0a912bfc86c5 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -16,21 +16,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/cpu_device_id.h> +#include <asm/simd.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> static const struct x86_cpu_id module_cpu_ids[] = { -#ifdef CONFIG_AS_SHA1_NI X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), -#endif X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), @@ -38,14 +34,10 @@ static const struct x86_cpu_id module_cpu_ids[] = { }; MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); -static int sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_block_fn *sha1_xform) +static inline int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha1_block_fn *sha1_xform) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) - return crypto_sha1_update(desc, data, len); + int remain; /* * Make sure struct sha1_state begins directly with the SHA1 @@ -54,22 +46,18 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); kernel_fpu_begin(); - sha1_base_do_update(desc, data, len, sha1_xform); + remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform); kernel_fpu_end(); - return 0; + return remain; } -static int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha1_block_fn *sha1_xform) +static inline int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, + sha1_block_fn *sha1_xform) { - if (!crypto_simd_usable()) - return crypto_sha1_finup(desc, data, len, out); - kernel_fpu_begin(); - if (len) - sha1_base_do_update(desc, data, len, sha1_xform); - sha1_base_do_finalize(desc, sha1_xform); + sha1_base_do_finup(desc, data, len, sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); @@ -90,23 +78,17 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_transform_ssse3); } -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha1_ssse3_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_ssse3_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ssse3_update, - .final = sha1_ssse3_final, .finup = sha1_ssse3_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ssse3", .cra_priority = 150, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -140,22 +122,17 @@ static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_transform_avx); } -static int sha1_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha1_avx_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_avx_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_avx_update, - .final = sha1_avx_final, .finup = sha1_avx_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-avx", .cra_priority = 160, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -200,8 +177,8 @@ static bool avx2_usable(void) return false; } -static void sha1_apply_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks) +static inline void sha1_apply_transform_avx2(struct sha1_state *state, + const u8 *data, int blocks) { /* Select the optimal transform based on data block size */ if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) @@ -222,22 +199,17 @@ static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); } -static int sha1_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha1_avx2_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_avx2_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_avx2_update, - .final = sha1_avx2_final, .finup = sha1_avx2_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-avx2", .cra_priority = 170, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -256,7 +228,6 @@ static void unregister_sha1_avx2(void) crypto_unregister_shash(&sha1_avx2_alg); } -#ifdef CONFIG_AS_SHA1_NI asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, int rounds); @@ -272,22 +243,17 @@ static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_ni_transform); } -static int sha1_ni_final(struct shash_desc *desc, u8 *out) -{ - return sha1_ni_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_ni_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ni_update, - .final = sha1_ni_final, .finup = sha1_ni_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ni", .cra_priority = 250, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -306,11 +272,6 @@ static void unregister_sha1_ni(void) crypto_unregister_shash(&sha1_ni_alg); } -#else -static inline int register_sha1_ni(void) { return 0; } -static inline void unregister_sha1_ni(void) { } -#endif - static int __init sha1_ssse3_mod_init(void) { if (!x86_match_cpu(module_cpu_ids)) @@ -360,6 +321,4 @@ MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-ssse3"); MODULE_ALIAS_CRYPTO("sha1-avx"); MODULE_ALIAS_CRYPTO("sha1-avx2"); -#ifdef CONFIG_AS_SHA1_NI MODULE_ALIAS_CRYPTO("sha1-ni"); -#endif diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c deleted file mode 100644 index e04a43d9f7d5..000000000000 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Cryptographic API. - * - * Glue code for the SHA256 Secure Hash Algorithm assembler implementations - * using SSSE3, AVX, AVX2, and SHA-NI instructions. - * - * This file is based on sha256_generic.c - * - * Copyright (C) 2013 Intel Corporation. - * - * Author: - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/string.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -asmlinkage void sha256_transform_ssse3(struct sha256_state *state, - const u8 *data, int blocks); - -static const struct x86_cpu_id module_cpu_ids[] = { -#ifdef CONFIG_AS_SHA256_NI - X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), -#endif - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static int _sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha256_block_fn *sha256_xform) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) - return crypto_sha256_update(desc, data, len); - - /* - * Make sure struct sha256_state begins directly with the SHA256 - * 256-bit internal state, as this is what the asm functions expect. - */ - BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); - - kernel_fpu_begin(); - sha256_base_do_update(desc, data, len, sha256_xform); - kernel_fpu_end(); - - return 0; -} - -static int sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha256_block_fn *sha256_xform) -{ - if (!crypto_simd_usable()) - return crypto_sha256_finup(desc, data, len, out); - - kernel_fpu_begin(); - if (len) - sha256_base_do_update(desc, data, len, sha256_xform); - sha256_base_do_finalize(desc, sha256_xform); - kernel_fpu_end(); - - return sha256_base_finish(desc, out); -} - -static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_ssse3); -} - -static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_ssse3); -} - -/* Add padding and return the message digest. */ -static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha256_ssse3_finup(desc, NULL, 0, out); -} - -static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_ssse3_finup(desc, data, len, out); -} - -static struct shash_alg sha256_ssse3_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_ssse3_update, - .final = sha256_ssse3_final, - .finup = sha256_ssse3_finup, - .digest = sha256_ssse3_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ssse3", - .cra_priority = 150, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_ssse3_update, - .final = sha256_ssse3_final, - .finup = sha256_ssse3_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ssse3", - .cra_priority = 150, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha256_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shashes(sha256_ssse3_algs, - ARRAY_SIZE(sha256_ssse3_algs)); - return 0; -} - -static void unregister_sha256_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(sha256_ssse3_algs, - ARRAY_SIZE(sha256_ssse3_algs)); -} - -asmlinkage void sha256_transform_avx(struct sha256_state *state, - const u8 *data, int blocks); - -static int sha256_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_avx); -} - -static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_avx); -} - -static int sha256_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha256_avx_finup(desc, NULL, 0, out); -} - -static int sha256_avx_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_avx_finup(desc, data, len, out); -} - -static struct shash_alg sha256_avx_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_avx_update, - .final = sha256_avx_final, - .finup = sha256_avx_finup, - .digest = sha256_avx_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-avx", - .cra_priority = 160, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_avx_update, - .final = sha256_avx_final, - .finup = sha256_avx_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-avx", - .cra_priority = 160, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int register_sha256_avx(void) -{ - if (avx_usable()) - return crypto_register_shashes(sha256_avx_algs, - ARRAY_SIZE(sha256_avx_algs)); - return 0; -} - -static void unregister_sha256_avx(void) -{ - if (avx_usable()) - crypto_unregister_shashes(sha256_avx_algs, - ARRAY_SIZE(sha256_avx_algs)); -} - -asmlinkage void sha256_transform_rorx(struct sha256_state *state, - const u8 *data, int blocks); - -static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_rorx); -} - -static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_rorx); -} - -static int sha256_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha256_avx2_finup(desc, NULL, 0, out); -} - -static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_avx2_finup(desc, data, len, out); -} - -static struct shash_alg sha256_avx2_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_avx2_update, - .final = sha256_avx2_final, - .finup = sha256_avx2_finup, - .digest = sha256_avx2_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-avx2", - .cra_priority = 170, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_avx2_update, - .final = sha256_avx2_final, - .finup = sha256_avx2_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-avx2", - .cra_priority = 170, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static int register_sha256_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shashes(sha256_avx2_algs, - ARRAY_SIZE(sha256_avx2_algs)); - return 0; -} - -static void unregister_sha256_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shashes(sha256_avx2_algs, - ARRAY_SIZE(sha256_avx2_algs)); -} - -#ifdef CONFIG_AS_SHA256_NI -asmlinkage void sha256_ni_transform(struct sha256_state *digest, - const u8 *data, int rounds); - -static int sha256_ni_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_ni_transform); -} - -static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_ni_transform); -} - -static int sha256_ni_final(struct shash_desc *desc, u8 *out) -{ - return sha256_ni_finup(desc, NULL, 0, out); -} - -static int sha256_ni_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_ni_finup(desc, data, len, out); -} - -static struct shash_alg sha256_ni_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_ni_update, - .final = sha256_ni_final, - .finup = sha256_ni_finup, - .digest = sha256_ni_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ni", - .cra_priority = 250, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_ni_update, - .final = sha256_ni_final, - .finup = sha256_ni_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ni", - .cra_priority = 250, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha256_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - return crypto_register_shashes(sha256_ni_algs, - ARRAY_SIZE(sha256_ni_algs)); - return 0; -} - -static void unregister_sha256_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - crypto_unregister_shashes(sha256_ni_algs, - ARRAY_SIZE(sha256_ni_algs)); -} - -#else -static inline int register_sha256_ni(void) { return 0; } -static inline void unregister_sha256_ni(void) { } -#endif - -static int __init sha256_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha256_ssse3()) - goto fail; - - if (register_sha256_avx()) { - unregister_sha256_ssse3(); - goto fail; - } - - if (register_sha256_avx2()) { - unregister_sha256_avx(); - unregister_sha256_ssse3(); - goto fail; - } - - if (register_sha256_ni()) { - unregister_sha256_avx2(); - unregister_sha256_avx(); - unregister_sha256_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha256_ssse3_mod_fini(void) -{ - unregister_sha256_ni(); - unregister_sha256_avx2(); - unregister_sha256_avx(); - unregister_sha256_ssse3(); -} - -module_init(sha256_ssse3_mod_init); -module_exit(sha256_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha256-ssse3"); -MODULE_ALIAS_CRYPTO("sha256-avx"); -MODULE_ALIAS_CRYPTO("sha256-avx2"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha224-ssse3"); -MODULE_ALIAS_CRYPTO("sha224-avx"); -MODULE_ALIAS_CRYPTO("sha224-avx2"); -#ifdef CONFIG_AS_SHA256_NI -MODULE_ALIAS_CRYPTO("sha256-ni"); -MODULE_ALIAS_CRYPTO("sha224-ni"); -#endif diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6d3b85e53d0e..067684c54395 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -27,17 +27,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/cpu_device_id.h> +#include <asm/simd.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> +#include <linux/kernel.h> #include <linux/module.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); @@ -45,11 +41,7 @@ asmlinkage void sha512_transform_ssse3(struct sha512_state *state, static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *sha512_xform) { - struct sha512_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) - return crypto_sha512_update(desc, data, len); + int remain; /* * Make sure struct sha512_state begins directly with the SHA512 @@ -58,22 +50,17 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); - sha512_base_do_update(desc, data, len, sha512_xform); + remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform); kernel_fpu_end(); - return 0; + return remain; } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { - if (!crypto_simd_usable()) - return crypto_sha512_finup(desc, data, len, out); - kernel_fpu_begin(); - if (len) - sha512_base_do_update(desc, data, len, sha512_xform); - sha512_base_do_finalize(desc, sha512_xform); + sha512_base_do_finup(desc, data, len, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); @@ -91,23 +78,18 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, return sha512_finup(desc, data, len, out, sha512_transform_ssse3); } -/* Add padding and return the message digest. */ -static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha512_ssse3_finup(desc, NULL, 0, out); -} - static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, - .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -115,13 +97,14 @@ static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_ssse3_update, - .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -167,23 +150,18 @@ static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, return sha512_finup(desc, data, len, out, sha512_transform_avx); } -/* Add padding and return the message digest. */ -static int sha512_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha512_avx_finup(desc, NULL, 0, out); -} - static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx_update, - .final = sha512_avx_final, .finup = sha512_avx_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -191,13 +169,14 @@ static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx_update, - .final = sha512_avx_final, .finup = sha512_avx_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -233,23 +212,18 @@ static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, return sha512_finup(desc, data, len, out, sha512_transform_rorx); } -/* Add padding and return the message digest. */ -static int sha512_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha512_avx2_finup(desc, NULL, 0, out); -} - static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx2_update, - .final = sha512_avx2_final, .finup = sha512_avx2_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -257,13 +231,14 @@ static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx2_update, - .final = sha512_avx2_final, .finup = sha512_avx2_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c index 661b6f22ffcd..6e8c42b9dc8e 100644 --- a/arch/x86/crypto/sm3_avx_glue.c +++ b/arch/x86/crypto/sm3_avx_glue.c @@ -10,12 +10,11 @@ #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/types.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> -#include <asm/simd.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> asmlinkage void sm3_transform_avx(struct sm3_state *state, const u8 *data, int nblocks); @@ -23,13 +22,7 @@ asmlinkage void sm3_transform_avx(struct sm3_state *state, static int sm3_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) { - sm3_update(sctx, data, len); - return 0; - } + int remain; /* * Make sure struct sm3_state begins directly with the SM3 @@ -38,45 +31,17 @@ static int sm3_avx_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0); kernel_fpu_begin(); - sm3_base_do_update(desc, data, len, sm3_transform_avx); + remain = sm3_base_do_update_blocks(desc, data, len, sm3_transform_avx); kernel_fpu_end(); - - return 0; + return remain; } static int sm3_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (len) - sm3_update(sctx, data, len); - - sm3_final(sctx, out); - return 0; - } - kernel_fpu_begin(); - if (len) - sm3_base_do_update(desc, data, len, sm3_transform_avx); - sm3_base_do_finalize(desc, sm3_transform_avx); + sm3_base_do_finup(desc, data, len, sm3_transform_avx); kernel_fpu_end(); - - return sm3_base_finish(desc, out); -} - -static int sm3_avx_final(struct shash_desc *desc, u8 *out) -{ - if (!crypto_simd_usable()) { - sm3_final(shash_desc_ctx(desc), out); - return 0; - } - - kernel_fpu_begin(); - sm3_base_do_finalize(desc, sm3_transform_avx); - kernel_fpu_end(); - return sm3_base_finish(desc, out); } @@ -84,13 +49,14 @@ static struct shash_alg sm3_avx_alg = { .digestsize = SM3_DIGEST_SIZE, .init = sm3_base_init, .update = sm3_avx_update, - .final = sm3_avx_final, .finup = sm3_avx_finup, - .descsize = sizeof(struct sm3_state), + .descsize = SM3_STATE_SIZE, .base = { .cra_name = "sm3", .cra_driver_name = "sm3-avx", .cra_priority = 300, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SM3_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c index 1148fd4cd57f..fec0ab7a63dd 100644 --- a/arch/x86/crypto/sm4_aesni_avx2_glue.c +++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c @@ -8,11 +8,10 @@ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ +#include <asm/fpu/api.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/kernel.h> -#include <asm/simd.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> #include "sm4-avx.h" @@ -48,10 +47,9 @@ static int ctr_crypt(struct skcipher_request *req) static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { { .base = { - .cra_name = "__ecb(sm4)", - .cra_driver_name = "__ecb-sm4-aesni-avx2", + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-aesni-avx2", .cra_priority = 500, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -64,10 +62,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { .decrypt = sm4_avx_ecb_decrypt, }, { .base = { - .cra_name = "__cbc(sm4)", - .cra_driver_name = "__cbc-sm4-aesni-avx2", + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-aesni-avx2", .cra_priority = 500, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -81,10 +78,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__ctr(sm4)", - .cra_driver_name = "__ctr-sm4-aesni-avx2", + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-aesni-avx2", .cra_priority = 500, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -100,9 +96,6 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = { } }; -static struct simd_skcipher_alg * -simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)]; - static int __init sm4_init(void) { const char *feature_name; @@ -121,16 +114,14 @@ static int __init sm4_init(void) return -ENODEV; } - return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers, - ARRAY_SIZE(sm4_aesni_avx2_skciphers), - simd_sm4_aesni_avx2_skciphers); + return crypto_register_skciphers(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers)); } static void __exit sm4_exit(void) { - simd_unregister_skciphers(sm4_aesni_avx2_skciphers, - ARRAY_SIZE(sm4_aesni_avx2_skciphers), - simd_sm4_aesni_avx2_skciphers); + crypto_unregister_skciphers(sm4_aesni_avx2_skciphers, + ARRAY_SIZE(sm4_aesni_avx2_skciphers)); } module_init(sm4_init); diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c index 85b4ca78b47b..72867fc49ce8 100644 --- a/arch/x86/crypto/sm4_aesni_avx_glue.c +++ b/arch/x86/crypto/sm4_aesni_avx_glue.c @@ -8,11 +8,10 @@ * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ +#include <asm/fpu/api.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/kernel.h> -#include <asm/simd.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> #include "sm4-avx.h" @@ -263,10 +262,9 @@ static int ctr_crypt(struct skcipher_request *req) static struct skcipher_alg sm4_aesni_avx_skciphers[] = { { .base = { - .cra_name = "__ecb(sm4)", - .cra_driver_name = "__ecb-sm4-aesni-avx", + .cra_name = "ecb(sm4)", + .cra_driver_name = "ecb-sm4-aesni-avx", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -279,10 +277,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .decrypt = sm4_avx_ecb_decrypt, }, { .base = { - .cra_name = "__cbc(sm4)", - .cra_driver_name = "__cbc-sm4-aesni-avx", + .cra_name = "cbc(sm4)", + .cra_driver_name = "cbc-sm4-aesni-avx", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -296,10 +293,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__ctr(sm4)", - .cra_driver_name = "__ctr-sm4-aesni-avx", + .cra_name = "ctr(sm4)", + .cra_driver_name = "ctr-sm4-aesni-avx", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, @@ -315,9 +311,6 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = { } }; -static struct simd_skcipher_alg * -simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)]; - static int __init sm4_init(void) { const char *feature_name; @@ -335,16 +328,14 @@ static int __init sm4_init(void) return -ENODEV; } - return simd_register_skciphers_compat(sm4_aesni_avx_skciphers, - ARRAY_SIZE(sm4_aesni_avx_skciphers), - simd_sm4_aesni_avx_skciphers); + return crypto_register_skciphers(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers)); } static void __exit sm4_exit(void) { - simd_unregister_skciphers(sm4_aesni_avx_skciphers, - ARRAY_SIZE(sm4_aesni_avx_skciphers), - simd_sm4_aesni_avx_skciphers); + crypto_unregister_skciphers(sm4_aesni_avx_skciphers, + ARRAY_SIZE(sm4_aesni_avx_skciphers)); } module_init(sm4_init); diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 3eb3440b477a..9e20db013750 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -13,7 +13,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/twofish.h> #include "twofish.h" @@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg twofish_algs[] = { { - .base.cra_name = "__ecb(twofish)", - .base.cra_driver_name = "__ecb-twofish-avx", + .base.cra_name = "ecb(twofish)", + .base.cra_driver_name = "ecb-twofish-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = TF_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct twofish_ctx), .base.cra_module = THIS_MODULE, @@ -87,10 +85,9 @@ static struct skcipher_alg twofish_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(twofish)", - .base.cra_driver_name = "__cbc-twofish-avx", + .base.cra_name = "cbc(twofish)", + .base.cra_driver_name = "cbc-twofish-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = TF_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct twofish_ctx), .base.cra_module = THIS_MODULE, @@ -103,8 +100,6 @@ static struct skcipher_alg twofish_algs[] = { }, }; -static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)]; - static int __init twofish_init(void) { const char *feature_name; @@ -114,15 +109,13 @@ static int __init twofish_init(void) return -ENODEV; } - return simd_register_skciphers_compat(twofish_algs, - ARRAY_SIZE(twofish_algs), - twofish_simd_algs); + return crypto_register_skciphers(twofish_algs, + ARRAY_SIZE(twofish_algs)); } static void __exit twofish_exit(void) { - simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs), - twofish_simd_algs); + crypto_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs)); } module_init(twofish_init); diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index d3caa31240ed..175958b02f2b 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -17,19 +17,20 @@ .pushsection .noinstr.text, "ax" -SYM_FUNC_START(entry_ibpb) +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) ANNOTATE_NOENDBR movl $MSR_IA32_PRED_CMD, %ecx - movl $PRED_CMD_IBPB, %eax + movl _ASM_RIP(x86_pred_cmd), %eax xorl %edx, %edx wrmsr /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET -SYM_FUNC_END(entry_ibpb) +SYM_FUNC_END(write_ibpb) /* For KVM */ -EXPORT_SYMBOL_GPL(entry_ibpb); +EXPORT_SYMBOL_GPL(write_ibpb); .popsection diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f40bdf97d390..ed04a968cc7d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1525,7 +1525,9 @@ SYM_CODE_END(rewind_stack_and_make_dead) * ORC to unwind properly. * * The alignment is for performance and not for safety, and may be safely - * refactored in the future if needed. + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. */ SYM_FUNC_START(clear_bhb_loop) ANNOTATE_NOENDBR @@ -1536,10 +1538,22 @@ SYM_FUNC_START(clear_bhb_loop) call 1f jmp 5f .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc ANNOTATE_INTRA_FUNCTION_CALL 1: call 2f - RET +.Lret1: RET .align 64, 0xcc + /* + * As above shift instructions for RET at .Lret2 as well. + * + * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc 2: movl $5, %eax 3: jmp 4f nop @@ -1547,7 +1561,7 @@ SYM_FUNC_START(clear_bhb_loop) jnz 3b sub $1, %ecx jnz 1b - RET +.Lret2: RET 5: lfence pop %rbp RET diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index adb299d3b6a1..afe105b2f907 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -65,7 +65,6 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, static void vdso_fix_landing(const struct vdso_image *image, struct vm_area_struct *new_vma) { -#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION if (in_ia32_syscall() && image == &vdso_image_32) { struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; @@ -76,7 +75,6 @@ static void vdso_fix_landing(const struct vdso_image *image, if (regs->ip == old_land_addr) regs->ip = new_vma->vm_start + vdso_land; } -#endif } static int vdso_mremap(const struct vm_special_mapping *sm, @@ -227,7 +225,6 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) return map_vdso(image, addr); } -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ @@ -235,45 +232,38 @@ static int load_vdso32(void) return map_vdso(&vdso_image_32, 0); } -#endif -#ifdef CONFIG_X86_64 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - if (!vdso64_enabled) - return 0; + if (IS_ENABLED(CONFIG_X86_64)) { + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_64, 0); + } - return map_vdso(&vdso_image_64, 0); + return load_vdso32(); } #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, bool x32) { -#ifdef CONFIG_X86_X32_ABI - if (x32) { + if (IS_ENABLED(CONFIG_X86_X32_ABI) && x32) { if (!vdso64_enabled) return 0; return map_vdso(&vdso_image_x32, 0); } -#endif -#ifdef CONFIG_IA32_EMULATION - return load_vdso32(); -#else + + if (IS_ENABLED(CONFIG_IA32_EMULATION)) + return load_vdso32(); + return 0; -#endif -} -#endif -#else -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - return load_vdso32(); } #endif bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) const struct vdso_image *image = current->mm->context.vdso_image; unsigned long vdso = (unsigned long) current->mm->context.vdso; @@ -282,7 +272,6 @@ bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) return true; } -#endif return false; } diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 2fb7d53cf333..c9103a6fa06e 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -341,9 +341,7 @@ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); -#if CONFIG_PGTABLE_LEVELS >= 5 set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); -#endif pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); pmd = pmd_offset(pud, VSYSCALL_ADDR); diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index ec3427463382..06f35a6b58a5 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -44,12 +44,12 @@ static inline unsigned int brs_to(int idx) static __always_inline void set_debug_extn_cfg(u64 val) { /* bits[4:3] must always be set to 11b */ - __wrmsr(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3, val >> 32); + native_wrmsrq(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); } static __always_inline u64 get_debug_extn_cfg(void) { - return __rdmsr(MSR_AMD_DBG_EXTN_CFG); + return native_rdmsrq(MSR_AMD_DBG_EXTN_CFG); } static bool __init amd_brs_detect(void) @@ -187,7 +187,7 @@ void amd_brs_reset(void) /* * Mark first entry as poisoned */ - wrmsrl(brs_to(0), BRS_POISON); + wrmsrq(brs_to(0), BRS_POISON); } int __init amd_brs_init(void) @@ -325,7 +325,7 @@ void amd_brs_drain(void) u32 brs_idx = tos - i; u64 from, to; - rdmsrl(brs_to(brs_idx), to); + rdmsrq(brs_to(brs_idx), to); /* Entry does not belong to us (as marked by kernel) */ if (to == BRS_POISON) @@ -341,7 +341,7 @@ void amd_brs_drain(void) if (!amd_brs_match_plm(event, to)) continue; - rdmsrl(brs_from(brs_idx), from); + rdmsrq(brs_from(brs_idx), from); perf_clear_branch_entry_bitfields(br+nr); @@ -371,7 +371,7 @@ static void amd_brs_poison_buffer(void) idx = amd_brs_get_tos(&cfg); /* Poison target of entry */ - wrmsrl(brs_to(idx), BRS_POISON); + wrmsrq(brs_to(idx), BRS_POISON); } /* diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 30d6ceb4c8ad..b20661b8621d 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -9,6 +9,7 @@ #include <linux/jiffies.h> #include <asm/apicdef.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/nmi.h> #include "../perf_event.h" @@ -563,13 +564,13 @@ static void amd_pmu_cpu_reset(int cpu) return; /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); /* * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze * and PerfCntrGLobalStatus.PerfCntrOvfl */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); } @@ -651,7 +652,7 @@ static void amd_pmu_cpu_dead(int cpu) static __always_inline void amd_pmu_set_global_ctl(u64 ctl) { - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); } static inline u64 amd_pmu_get_global_status(void) @@ -659,7 +660,7 @@ static inline u64 amd_pmu_get_global_status(void) u64 status; /* PerfCntrGlobalStatus is read-only */ - rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); return status; } @@ -672,14 +673,14 @@ static inline void amd_pmu_ack_global_status(u64 status) * clears the same bit in PerfCntrGlobalStatus */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); } static bool amd_pmu_test_overflow_topbit(int idx) { u64 counter; - rdmsrl(x86_pmu_event_addr(idx), counter); + rdmsrq(x86_pmu_event_addr(idx), counter); return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); } @@ -1003,8 +1004,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, &data, regs); } /* diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0252b7ea8bca..112f43b23ebf 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -15,6 +15,7 @@ #include <linux/sched/clock.h> #include <asm/apic.h> +#include <asm/msr.h> #include "../perf_event.h" @@ -26,7 +27,7 @@ static u32 ibs_caps; #include <linux/hardirq.h> #include <asm/nmi.h> -#include <asm/amd-ibs.h> +#include <asm/amd/ibs.h> /* attr.config2 */ #define IBS_SW_FILTER_MASK 1 @@ -424,7 +425,7 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, * prev count manually on overflow. */ while (!perf_event_try_update(event, count, 64)) { - rdmsrl(event->hw.config_base, *config); + rdmsrq(event->hw.config_base, *config); count = perf_ibs->get_count(*config); } } @@ -435,9 +436,9 @@ static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, u64 tmp = hwc->config | config; if (perf_ibs->fetch_count_reset_broken) - wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask); + wrmsrq(hwc->config_base, tmp & ~perf_ibs->enable_mask); - wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask); + wrmsrq(hwc->config_base, tmp | perf_ibs->enable_mask); } /* @@ -452,9 +453,9 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, { config &= ~perf_ibs->cnt_mask; if (boot_cpu_data.x86 == 0x10) - wrmsrl(hwc->config_base, config); + wrmsrq(hwc->config_base, config); config &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, config); + wrmsrq(hwc->config_base, config); } /* @@ -513,7 +514,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (!stopping && (hwc->state & PERF_HES_UPTODATE)) return; - rdmsrl(hwc->config_base, config); + rdmsrq(hwc->config_base, config); if (stopping) { /* @@ -1256,7 +1257,7 @@ fail: hwc = &event->hw; msr = hwc->config_base; buf = ibs_data.regs; - rdmsrl(msr, *buf); + rdmsrq(msr, *buf); if (!(*buf++ & perf_ibs->valid_mask)) goto fail; @@ -1274,7 +1275,7 @@ fail: offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip); do { - rdmsrl(msr + offset, *buf++); + rdmsrq(msr + offset, *buf++); size++; offset = find_next_bit(perf_ibs->offset_mask, perf_ibs->offset_max, @@ -1304,17 +1305,17 @@ fail: if (event->attr.sample_type & PERF_SAMPLE_RAW) { if (perf_ibs == &perf_ibs_op) { if (ibs_caps & IBS_CAPS_BRNTRGT) { - rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + rdmsrq(MSR_AMD64_IBSBRTARGET, *buf++); br_target_idx = size; size++; } if (ibs_caps & IBS_CAPS_OPDATA4) { - rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + rdmsrq(MSR_AMD64_IBSOPDATA4, *buf++); size++; } } if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) { - rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++); + rdmsrq(MSR_AMD64_ICIBSEXTDCTL, *buf++); size++; } } @@ -1373,9 +1374,7 @@ fail: hwc->sample_period = perf_ibs->min_period; out: - if (throttle) { - perf_ibs_stop(event, 0); - } else { + if (!throttle) { if (perf_ibs == &perf_ibs_op) { if (ibs_caps & IBS_CAPS_OPCNTEXT) { new_config = period & IBS_OP_MAX_CNT_EXT_MASK; @@ -1565,7 +1564,7 @@ static inline int ibs_eilvt_valid(void) preempt_disable(); - rdmsrl(MSR_AMD64_IBSCTL, val); + rdmsrq(MSR_AMD64_IBSCTL, val); offset = val & IBSCTL_LVT_OFFSET_MASK; if (!(val & IBSCTL_LVT_OFFSET_VALID)) { @@ -1680,7 +1679,7 @@ static inline int get_ibs_lvt_offset(void) { u64 val; - rdmsrl(MSR_AMD64_IBSCTL, val); + rdmsrq(MSR_AMD64_IBSCTL, val); if (!(val & IBSCTL_LVT_OFFSET_VALID)) return -EINVAL; diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index f8228d8243f7..a721da9987dd 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -16,6 +16,8 @@ #include <linux/slab.h> #include <linux/amd-iommu.h> +#include <asm/msr.h> + #include "../perf_event.h" #include "iommu.h" diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index c06ccca96851..d24da377df77 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include "../perf_event.h" @@ -61,19 +62,19 @@ struct branch_entry { static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) { - wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + wrmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2, val); } static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) { - wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + wrmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); } static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) { u64 val; - rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + rdmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2, val); return val; } @@ -82,7 +83,7 @@ static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) { u64 val; - rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + rdmsrq(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); return val; } @@ -333,7 +334,7 @@ void amd_pmu_lbr_reset(void) cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; - wrmsrl(MSR_AMD64_LBR_SELECT, 0); + wrmsrq(MSR_AMD64_LBR_SELECT, 0); } void amd_pmu_lbr_add(struct perf_event *event) @@ -396,16 +397,16 @@ void amd_pmu_lbr_enable_all(void) /* Set hardware branch filter */ if (cpuc->lbr_select) { lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; - wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); + wrmsrq(MSR_AMD64_LBR_SELECT, lbr_select); } if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + rdmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); } - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); - wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); + rdmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + wrmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); } void amd_pmu_lbr_disable_all(void) diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 37d5b380516e..dad42790cf7d 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/perf_event.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include "../perf_event.h" /* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */ @@ -48,8 +49,8 @@ static void event_update(struct perf_event *event) prev_pwr_acc = hwc->pwr_acc; prev_ptsc = hwc->ptsc; - rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc); - rdmsrl(MSR_F15H_PTSC, new_ptsc); + rdmsrq(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc); + rdmsrq(MSR_F15H_PTSC, new_ptsc); /* * Calculate the CU power consumption over a time period, the unit of @@ -75,8 +76,8 @@ static void __pmu_event_start(struct perf_event *event) event->hw.state = 0; - rdmsrl(MSR_F15H_PTSC, event->hw.ptsc); - rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc); + rdmsrq(MSR_F15H_PTSC, event->hw.ptsc); + rdmsrq(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc); } static void pmu_event_start(struct perf_event *event, int mode) @@ -272,7 +273,7 @@ static int __init amd_power_pmu_init(void) cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); - if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) { + if (rdmsrq_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) { pr_err("Failed to read max compute unit power accumulator MSR\n"); return -ENODEV; } diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 49c26ce2b115..e8b6af199c73 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -21,6 +21,7 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 #define NUM_COUNTERS_L3 6 +#define NUM_COUNTERS_MAX 64 #define RDPMC_BASE_NB 6 #define RDPMC_BASE_LLC 10 @@ -38,7 +39,10 @@ struct amd_uncore_ctx { int refcnt; int cpu; struct perf_event **events; - struct hlist_node node; + unsigned long active_mask[BITS_TO_LONGS(NUM_COUNTERS_MAX)]; + int nr_active; + struct hrtimer hrtimer; + u64 hrtimer_duration; }; struct amd_uncore_pmu { @@ -83,11 +87,51 @@ struct amd_uncore { static struct amd_uncore uncores[UNCORE_TYPE_MAX]; +/* Interval for hrtimer, defaults to 60000 milliseconds */ +static unsigned int update_interval = 60 * MSEC_PER_SEC; +module_param(update_interval, uint, 0444); + static struct amd_uncore_pmu *event_to_amd_uncore_pmu(struct perf_event *event) { return container_of(event->pmu, struct amd_uncore_pmu, pmu); } +static enum hrtimer_restart amd_uncore_hrtimer(struct hrtimer *hrtimer) +{ + struct amd_uncore_ctx *ctx; + struct perf_event *event; + int bit; + + ctx = container_of(hrtimer, struct amd_uncore_ctx, hrtimer); + + if (!ctx->nr_active || ctx->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + + for_each_set_bit(bit, ctx->active_mask, NUM_COUNTERS_MAX) { + event = ctx->events[bit]; + event->pmu->read(event); + } + + hrtimer_forward_now(hrtimer, ns_to_ktime(ctx->hrtimer_duration)); + return HRTIMER_RESTART; +} + +static void amd_uncore_start_hrtimer(struct amd_uncore_ctx *ctx) +{ + hrtimer_start(&ctx->hrtimer, ns_to_ktime(ctx->hrtimer_duration), + HRTIMER_MODE_REL_PINNED_HARD); +} + +static void amd_uncore_cancel_hrtimer(struct amd_uncore_ctx *ctx) +{ + hrtimer_cancel(&ctx->hrtimer); +} + +static void amd_uncore_init_hrtimer(struct amd_uncore_ctx *ctx) +{ + hrtimer_setup(&ctx->hrtimer, amd_uncore_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); +} + static void amd_uncore_read(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -106,9 +150,9 @@ static void amd_uncore_read(struct perf_event *event) * read counts directly from the corresponding PERF_CTR. */ if (hwc->event_base_rdpmc < 0) - rdmsrl(hwc->event_base, new); + rdmsrq(hwc->event_base, new); else - rdpmcl(hwc->event_base_rdpmc, new); + new = rdpmc(hwc->event_base_rdpmc); local64_set(&hwc->prev_count, new); delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); @@ -118,27 +162,40 @@ static void amd_uncore_read(struct perf_event *event) static void amd_uncore_start(struct perf_event *event, int flags) { + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; + if (!ctx->nr_active++) + amd_uncore_start_hrtimer(ctx); + if (flags & PERF_EF_RELOAD) - wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + wrmsrq(hwc->event_base, (u64)local64_read(&hwc->prev_count)); hwc->state = 0; - wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); + __set_bit(hwc->idx, ctx->active_mask); + wrmsrq(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); perf_event_update_userpage(event); } static void amd_uncore_stop(struct perf_event *event, int flags) { + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, hwc->config); + wrmsrq(hwc->config_base, hwc->config); hwc->state |= PERF_HES_STOPPED; if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { event->pmu->read(event); hwc->state |= PERF_HES_UPTODATE; } + + if (!--ctx->nr_active) + amd_uncore_cancel_hrtimer(ctx); + + __clear_bit(hwc->idx, ctx->active_mask); } static int amd_uncore_add(struct perf_event *event, int flags) @@ -491,6 +548,9 @@ static int amd_uncore_ctx_init(struct amd_uncore *uncore, unsigned int cpu) goto fail; } + amd_uncore_init_hrtimer(curr); + curr->hrtimer_duration = (u64)update_interval * NSEC_PER_MSEC; + cpumask_set_cpu(cpu, &pmu->active_mask); } @@ -880,16 +940,55 @@ static int amd_uncore_umc_event_init(struct perf_event *event) static void amd_uncore_umc_start(struct perf_event *event, int flags) { + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); struct hw_perf_event *hwc = &event->hw; + if (!ctx->nr_active++) + amd_uncore_start_hrtimer(ctx); + if (flags & PERF_EF_RELOAD) - wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + wrmsrq(hwc->event_base, (u64)local64_read(&hwc->prev_count)); hwc->state = 0; - wrmsrl(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC)); + __set_bit(hwc->idx, ctx->active_mask); + wrmsrq(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC)); perf_event_update_userpage(event); } +static void amd_uncore_umc_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new, shift; + s64 delta; + + shift = COUNTER_SHIFT + 1; + prev = local64_read(&hwc->prev_count); + + /* + * UMC counters do not have RDPMC assignments. Read counts directly + * from the corresponding PERF_CTR. + */ + rdmsrl(hwc->event_base, new); + + /* + * Unlike the other uncore counters, UMC counters saturate and set the + * Overflow bit (bit 48) on overflow. Since they do not roll over, + * proactively reset the corresponding PERF_CTR when bit 47 is set so + * that the counter never gets a chance to saturate. + */ + if (new & BIT_ULL(63 - COUNTER_SHIFT)) { + wrmsrl(hwc->event_base, 0); + local64_set(&hwc->prev_count, 0); + } else { + local64_set(&hwc->prev_count, new); + } + + delta = (new << shift) - (prev << shift); + delta >>= shift; + local64_add(delta, &event->count); +} + static void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu) { @@ -968,7 +1067,7 @@ int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu) .del = amd_uncore_del, .start = amd_uncore_umc_start, .stop = amd_uncore_stop, - .read = amd_uncore_read, + .read = amd_uncore_umc_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, .module = THIS_MODULE, }; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b..7610f26dfbd9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -32,6 +32,7 @@ #include <asm/apic.h> #include <asm/stacktrace.h> +#include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> @@ -95,6 +96,11 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -134,7 +140,7 @@ u64 x86_perf_event_update(struct perf_event *event) */ prev_raw_count = local64_read(&hwc->prev_count); do { - rdpmcl(hwc->event_base_rdpmc, new_raw_count); + new_raw_count = rdpmc(hwc->event_base_rdpmc); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); @@ -269,7 +275,7 @@ bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, */ for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); - ret = rdmsrl_safe(reg, &val); + ret = rdmsrq_safe(reg, &val); if (ret) goto msr_fail; if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { @@ -283,7 +289,7 @@ bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, if (*(u64 *)fixed_cntr_mask) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - ret = rdmsrl_safe(reg, &val); + ret = rdmsrq_safe(reg, &val); if (ret) goto msr_fail; for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) { @@ -314,11 +320,11 @@ bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ reg = x86_pmu_event_addr(reg_safe); - if (rdmsrl_safe(reg, &val)) + if (rdmsrq_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; - ret = wrmsrl_safe(reg, val); - ret |= rdmsrl_safe(reg, &val_new); + ret = wrmsrq_safe(reg, val); + ret |= rdmsrq_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; @@ -629,7 +635,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); - if (!event->attr.freq && x86_pmu.limit_period) { + if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) @@ -674,6 +680,7 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; @@ -693,13 +700,13 @@ void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(x86_pmu_config_addr(idx), val); + rdmsrq(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu_config_addr(idx), val); + wrmsrq(x86_pmu_config_addr(idx), val); if (is_counter_pair(hwc)) - wrmsrl(x86_pmu_config_addr(idx + 1), 0); + wrmsrq(x86_pmu_config_addr(idx + 1), 0); } } @@ -754,17 +761,18 @@ void x86_pmu_enable_all(int added) } } -static inline int is_x86_event(struct perf_event *event) +int is_x86_event(struct perf_event *event) { - int i; - - if (!is_hybrid()) - return event->pmu == &pmu; - - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) - return true; - } + /* + * For a non-hybrid platforms, the type of X86 pmu is + * always PERF_TYPE_RAW. + * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE + * is a unique capability for the X86 PMU. + * Use them to detect a X86 event. + */ + if (event->pmu->type == PERF_TYPE_RAW || + event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE) + return true; return false; } @@ -1420,14 +1428,14 @@ int x86_perf_event_set_period(struct perf_event *event) */ local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) - wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); + wrmsrq(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); @@ -1550,10 +1558,10 @@ void perf_event_print_debug(void) return; if (x86_pmu.version >= 2) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + rdmsrq(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrq(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); @@ -1561,19 +1569,19 @@ void perf_event_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); if (pebs_constraints) { - rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); + rdmsrq(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } if (x86_pmu.lbr_nr) { - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) { - rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); - rdmsrl(x86_pmu_event_addr(idx), pmc_count); + rdmsrq(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrq(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); @@ -1587,7 +1595,7 @@ void perf_event_print_debug(void) for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; - rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count); + rdmsrq(x86_pmu_fixed_ctr_addr(idx), pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); @@ -1683,6 +1691,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; + u64 last_period; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1702,6 +1711,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) continue; event = cpuc->events[idx]; + last_period = event->hw.last_period; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) @@ -1715,12 +1725,11 @@ int x86_pmu_handle_irq(struct pt_regs *regs) if (!static_call(x86_pmu_set_period)(event)) continue; - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(&data, 0, last_period); perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, &data, regs); } if (handled) @@ -2046,6 +2055,11 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_filter, x86_pmu.filter); static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); + + static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable); + static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable); + static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all); + static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all); } static void _x86_pmu_read(struct perf_event *event) @@ -2496,9 +2510,9 @@ void perf_clear_dirty_counters(void) if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; - wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); + wrmsrq(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); } else { - wrmsrl(x86_pmu_event_addr(i), 0); + wrmsrq(x86_pmu_event_addr(i), 0); } } @@ -2803,8 +2817,15 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return 0; + /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = READ_ONCE(current->active_mm->context.ldt); + ldt = smp_load_acquire(¤t->mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index a95e6c91c4d7..61da6b8a3d51 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -17,6 +17,7 @@ #include <linux/sizes.h> #include <asm/perf_event.h> +#include <asm/msr.h> #include "../perf_event.h" @@ -80,54 +81,54 @@ static void * bts_buffer_setup_aux(struct perf_event *event, void **pages, int nr_pages, bool overwrite) { - struct bts_buffer *buf; + struct bts_buffer *bb; struct page *page; int cpu = event->cpu; int node = (cpu == -1) ? cpu : cpu_to_node(cpu); unsigned long offset; size_t size = nr_pages << PAGE_SHIFT; - int pg, nbuf, pad; + int pg, nr_buf, pad; /* count all the high order buffers */ - for (pg = 0, nbuf = 0; pg < nr_pages;) { + for (pg = 0, nr_buf = 0; pg < nr_pages;) { page = virt_to_page(pages[pg]); pg += buf_nr_pages(page); - nbuf++; + nr_buf++; } /* * to avoid interrupts in overwrite mode, only allow one physical */ - if (overwrite && nbuf > 1) + if (overwrite && nr_buf > 1) return NULL; - buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); - if (!buf) + bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node); + if (!bb) return NULL; - buf->nr_pages = nr_pages; - buf->nr_bufs = nbuf; - buf->snapshot = overwrite; - buf->data_pages = pages; - buf->real_size = size - size % BTS_RECORD_SIZE; + bb->nr_pages = nr_pages; + bb->nr_bufs = nr_buf; + bb->snapshot = overwrite; + bb->data_pages = pages; + bb->real_size = size - size % BTS_RECORD_SIZE; - for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { + for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) { unsigned int __nr_pages; page = virt_to_page(pages[pg]); __nr_pages = buf_nr_pages(page); - buf->buf[nbuf].page = page; - buf->buf[nbuf].offset = offset; - buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; - pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; - buf->buf[nbuf].size -= pad; + bb->buf[nr_buf].page = page; + bb->buf[nr_buf].offset = offset; + bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); + bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement; + pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE; + bb->buf[nr_buf].size -= pad; pg += __nr_pages; offset += __nr_pages << PAGE_SHIFT; } - return buf; + return bb; } static void bts_buffer_free_aux(void *data) @@ -135,25 +136,25 @@ static void bts_buffer_free_aux(void *data) kfree(data); } -static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) +static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx) { - return buf->buf[idx].offset + buf->buf[idx].displacement; + return bb->buf[idx].offset + bb->buf[idx].displacement; } static void -bts_config_buffer(struct bts_buffer *buf) +bts_config_buffer(struct bts_buffer *bb) { int cpu = raw_smp_processor_id(); struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_phys *phys = &buf->buf[buf->cur_buf]; + struct bts_phys *phys = &bb->buf[bb->cur_buf]; unsigned long index, thresh = 0, end = phys->size; struct page *page = phys->page; - index = local_read(&buf->head); + index = local_read(&bb->head); - if (!buf->snapshot) { - if (buf->end < phys->offset + buf_size(page)) - end = buf->end - phys->offset - phys->displacement; + if (!bb->snapshot) { + if (bb->end < phys->offset + buf_size(page)) + end = bb->end - phys->offset - phys->displacement; index -= phys->offset + phys->displacement; @@ -168,7 +169,7 @@ bts_config_buffer(struct bts_buffer *buf) ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; ds->bts_index = ds->bts_buffer_base + index; ds->bts_absolute_maximum = ds->bts_buffer_base + end; - ds->bts_interrupt_threshold = !buf->snapshot + ds->bts_interrupt_threshold = !bb->snapshot ? ds->bts_buffer_base + thresh : ds->bts_absolute_maximum + BTS_RECORD_SIZE; } @@ -184,16 +185,16 @@ static void bts_update(struct bts_ctx *bts) { int cpu = raw_smp_processor_id(); struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *bb = perf_get_aux(&bts->handle); unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; - if (!buf) + if (!bb) return; - head = index + bts_buffer_offset(buf, buf->cur_buf); - old = local_xchg(&buf->head, head); + head = index + bts_buffer_offset(bb, bb->cur_buf); + old = local_xchg(&bb->head, head); - if (!buf->snapshot) { + if (!bb->snapshot) { if (old == head) return; @@ -205,9 +206,9 @@ static void bts_update(struct bts_ctx *bts) * old and head are always in the same physical buffer, so we * can subtract them to get the data size. */ - local_add(head - old, &buf->data_size); + local_add(head - old, &bb->data_size); } else { - local_set(&buf->data_size, head); + local_set(&bb->data_size, head); } /* @@ -218,7 +219,7 @@ static void bts_update(struct bts_ctx *bts) } static int -bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); +bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle); /* * Ordering PMU callbacks wrt themselves and the PMI is done by means @@ -232,17 +233,17 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *bb = perf_get_aux(&bts->handle); u64 config = 0; - if (!buf->snapshot) + if (!bb->snapshot) config |= ARCH_PERFMON_EVENTSEL_INT; if (!event->attr.exclude_kernel) config |= ARCH_PERFMON_EVENTSEL_OS; if (!event->attr.exclude_user) config |= ARCH_PERFMON_EVENTSEL_USR; - bts_config_buffer(buf); + bts_config_buffer(bb); /* * local barrier to make sure that ds configuration made it @@ -261,13 +262,13 @@ static void bts_event_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(bts_ctx); - struct bts_buffer *buf; + struct bts_buffer *bb; - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) + bb = perf_aux_output_begin(&bts->handle, event); + if (!bb) goto fail_stop; - if (bts_buffer_reset(buf, &bts->handle)) + if (bts_buffer_reset(bb, &bts->handle)) goto fail_end_stop; bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; @@ -306,27 +307,27 @@ static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(bts_ctx); - struct bts_buffer *buf = NULL; + struct bts_buffer *bb = NULL; int state = READ_ONCE(bts->state); if (state == BTS_STATE_ACTIVE) __bts_event_stop(event, BTS_STATE_STOPPED); if (state != BTS_STATE_STOPPED) - buf = perf_get_aux(&bts->handle); + bb = perf_get_aux(&bts->handle); event->hw.state |= PERF_HES_STOPPED; if (flags & PERF_EF_UPDATE) { bts_update(bts); - if (buf) { - if (buf->snapshot) + if (bb) { + if (bb->snapshot) bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); + local_xchg(&bb->data_size, + bb->nr_pages << PAGE_SHIFT); perf_aux_output_end(&bts->handle, - local_xchg(&buf->data_size, 0)); + local_xchg(&bb->data_size, 0)); } cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; @@ -382,19 +383,19 @@ void intel_bts_disable_local(void) } static int -bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) +bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle) { unsigned long head, space, next_space, pad, gap, skip, wakeup; unsigned int next_buf; struct bts_phys *phys, *next_phys; int ret; - if (buf->snapshot) + if (bb->snapshot) return 0; - head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); + head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1); - phys = &buf->buf[buf->cur_buf]; + phys = &bb->buf[bb->cur_buf]; space = phys->offset + phys->displacement + phys->size - head; pad = space; if (space > handle->size) { @@ -403,10 +404,10 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) } if (space <= BTS_SAFETY_MARGIN) { /* See if next phys buffer has more space */ - next_buf = buf->cur_buf + 1; - if (next_buf >= buf->nr_bufs) + next_buf = bb->cur_buf + 1; + if (next_buf >= bb->nr_bufs) next_buf = 0; - next_phys = &buf->buf[next_buf]; + next_phys = &bb->buf[next_buf]; gap = buf_size(phys->page) - phys->displacement - phys->size + next_phys->displacement; skip = pad + gap; @@ -431,8 +432,8 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) * anymore, so we must not be racing with * bts_update(). */ - buf->cur_buf = next_buf; - local_set(&buf->head, head); + bb->cur_buf = next_buf; + local_set(&bb->head, head); } } } @@ -445,7 +446,7 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) space -= space % BTS_RECORD_SIZE; } - buf->end = head + space; + bb->end = head + space; /* * If we have no space, the lost notification would have been sent when @@ -462,7 +463,7 @@ int intel_bts_interrupt(void) struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; struct bts_ctx *bts; struct perf_event *event; - struct bts_buffer *buf; + struct bts_buffer *bb; s64 old_head; int err = -ENOSPC, handled = 0; @@ -485,8 +486,8 @@ int intel_bts_interrupt(void) if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) return handled; - buf = perf_get_aux(&bts->handle); - if (!buf) + bb = perf_get_aux(&bts->handle); + if (!bb) return handled; /* @@ -494,26 +495,26 @@ int intel_bts_interrupt(void) * there's no other way of telling, because the pointer will * keep moving */ - if (buf->snapshot) + if (bb->snapshot) return 0; - old_head = local_read(&buf->head); + old_head = local_read(&bb->head); bts_update(bts); /* no new data */ - if (old_head == local_read(&buf->head)) + if (old_head == local_read(&bb->head)) return handled; - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0)); + perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0)); - buf = perf_aux_output_begin(&bts->handle, event); - if (buf) - err = bts_buffer_reset(buf, &bts->handle); + bb = perf_aux_output_begin(&bts->handle, event); + if (bb) + err = bts_buffer_reset(bb, &bts->handle); if (err) { WRITE_ONCE(bts->state, BTS_STATE_STOPPED); - if (buf) { + if (bb) { /* * BTS_STATE_STOPPED should be visible before * cleared handle::event @@ -599,7 +600,11 @@ static void bts_event_read(struct perf_event *event) static __init int bts_init(void) { - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return -ENODEV; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + if (!x86_pmu.bts) return -ENODEV; if (boot_cpu_has(X86_FEATURE_PTI)) { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 09d2d66c9f21..466283326630 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -23,6 +23,7 @@ #include <asm/intel_pt.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include "../perf_event.h" @@ -2224,6 +2225,18 @@ static struct extra_reg intel_cmt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_skt, "event=0x9c,umask=0x01"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_skt, "event=0xc2,umask=0x02"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_skt, "event=0xa4,umask=0x02"); + +static struct attribute *skt_events_attrs[] = { + EVENT_PTR(td_fe_bound_skt), + EVENT_PTR(td_retiring_skt), + EVENT_PTR(td_bad_spec_cmt), + EVENT_PTR(td_be_bound_skt), + NULL, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -2285,7 +2298,7 @@ static __always_inline void __intel_pmu_disable_all(bool bts) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0); if (bts && test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); @@ -2294,7 +2307,7 @@ static __always_inline void __intel_pmu_disable_all(bool bts) static __always_inline void intel_pmu_disable_all(void) { __intel_pmu_disable_all(true); - intel_pmu_pebs_disable_all(); + static_call_cond(x86_pmu_pebs_disable_all)(); intel_pmu_lbr_disable_all(); } @@ -2306,11 +2319,11 @@ static void __intel_pmu_enable_all(int added, bool pmi) intel_pmu_lbr_enable_all(pmi); if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) { - wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); + wrmsrq(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val; } - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, intel_ctrl & ~cpuc->intel_ctrl_guest_mask); if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { @@ -2326,7 +2339,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) static void intel_pmu_enable_all(int added) { - intel_pmu_pebs_enable_all(); + static_call_cond(x86_pmu_pebs_enable_all)(); __intel_pmu_enable_all(added, false); } @@ -2426,12 +2439,12 @@ static void intel_pmu_nhm_workaround(void) } for (i = 0; i < 4; i++) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); - wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); + wrmsrq(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); + wrmsrq(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); } - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); + wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); for (i = 0; i < 4; i++) { event = cpuc->events[i]; @@ -2441,7 +2454,7 @@ static void intel_pmu_nhm_workaround(void) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } else - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); + wrmsrq(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); } } @@ -2458,7 +2471,7 @@ static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on) if (cpuc->tfa_shadow != val) { cpuc->tfa_shadow = val; - wrmsrl(MSR_TSX_FORCE_ABORT, val); + wrmsrq(MSR_TSX_FORCE_ABORT, val); } } @@ -2489,14 +2502,14 @@ static inline u64 intel_pmu_get_status(void) { u64 status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; } static inline void intel_pmu_ack_status(u64 ack) { - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } static inline bool event_is_checkpointed(struct perf_event *event) @@ -2583,7 +2596,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * so we don't trigger the event without PEBS bit set. */ if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); + static_call(x86_pmu_pebs_disable)(event); } static void intel_pmu_assign_event(struct perf_event *event, int idx) @@ -2603,6 +2616,9 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } static int icl_set_topdown_event_period(struct perf_event *event) @@ -2619,15 +2635,15 @@ static int icl_set_topdown_event_period(struct perf_event *event) * Don't need to clear them again. */ if (left == x86_pmu.max_period) { - wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); - wrmsrl(MSR_PERF_METRICS, 0); + wrmsrq(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrq(MSR_PERF_METRICS, 0); hwc->saved_slots = 0; hwc->saved_metric = 0; } if ((hwc->saved_slots) && is_slots_event(event)) { - wrmsrl(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots); - wrmsrl(MSR_PERF_METRICS, hwc->saved_metric); + wrmsrq(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots); + wrmsrq(MSR_PERF_METRICS, hwc->saved_metric); } perf_event_update_userpage(event); @@ -2724,12 +2740,12 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, if (!val) { /* read Fixed counter 3 */ - rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + slots = rdpmc(3 | INTEL_PMC_FIXED_RDPMC_BASE); if (!slots) return 0; /* read PERF_METRICS */ - rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + metrics = rdpmc(INTEL_PMC_FIXED_RDPMC_METRICS); } else { slots = val[0]; metrics = val[1]; @@ -2773,8 +2789,8 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, if (reset) { /* The fixed counter 3 has to be written before the PERF_METRICS. */ - wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); - wrmsrl(MSR_PERF_METRICS, 0); + wrmsrq(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrq(MSR_PERF_METRICS, 0); if (event) update_saved_topdown_regs(event, 0, 0, metric_end); } @@ -2880,6 +2896,52 @@ static void intel_pmu_enable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val |= bits; } +static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int msr_b, msr_c; + + if (!mask && !cpuc->acr_cfg_b[idx]) + return; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; + msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + } else { + msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; + msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; + idx -= INTEL_PMC_IDX_FIXED; + } + + if (cpuc->acr_cfg_b[idx] != mask) { + wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + cpuc->acr_cfg_b[idx] = mask; + } + /* Only need to update the reload value when there is a valid config value. */ + if (mask && cpuc->acr_cfg_c[idx] != reload) { + wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + cpuc->acr_cfg_c[idx] = reload; + } +} + +static void intel_pmu_enable_acr(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_acr_event_group(event) || !event->attr.config2) { + /* + * The disable doesn't clear the ACR CFG register. + * Check and clear the ACR CFG register. + */ + intel_pmu_config_acr(hwc->idx, 0, 0); + return; + } + + intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2887,16 +2949,19 @@ static void intel_pmu_enable_event(struct perf_event *event) int idx = hwc->idx; if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); + static_call(x86_pmu_pebs_enable)(event); switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: if (branch_sample_counters(event)) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); + static_call_cond(intel_pmu_enable_acr_event)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_enable_acr_event)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); break; @@ -2914,12 +2979,51 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) +{ + struct perf_event *event, *leader; + int i, j, idx; + + for (i = 0; i < cpuc->n_events; i++) { + leader = cpuc->event_list[i]; + if (!is_acr_event_group(leader)) + continue; + + /* The ACR events must be contiguous. */ + for (j = i; j < cpuc->n_events; j++) { + event = cpuc->event_list[j]; + if (event->group_leader != leader->group_leader) + break; + for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + return; + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + } + i = j - 1; + } +} + +void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!cpuc->n_late_setup) + return; + + intel_pmu_pebs_late_setup(cpuc); + intel_pmu_acr_late_setup(cpuc); +} + static void intel_pmu_add_event(struct perf_event *event) { if (event->attr.precise_ip) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } /* @@ -2937,7 +3041,7 @@ int intel_pmu_save_and_restart(struct perf_event *event) */ if (unlikely(event_is_checkpointed(event))) { /* No race with NMIs because the counter should not be armed */ - wrmsrl(event->hw.event_base, 0); + wrmsrq(event->hw.event_base, 0); local64_set(&event->hw.prev_count, 0); } return static_call(x86_pmu_set_period)(event); @@ -2976,13 +3080,13 @@ static void intel_pmu_reset(void) pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); for_each_set_bit(idx, cntr_mask, INTEL_PMC_MAX_GENERIC) { - wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); - wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); + wrmsrq_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrq_safe(x86_pmu_event_addr(idx), 0ull); } for_each_set_bit(idx, fixed_cntr_mask, INTEL_PMC_MAX_FIXED) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; - wrmsrl_safe(x86_pmu_fixed_ctr_addr(idx), 0ull); + wrmsrq_safe(x86_pmu_fixed_ctr_addr(idx), 0ull); } if (ds) @@ -2991,7 +3095,7 @@ static void intel_pmu_reset(void) /* Ack all overflows and disable fixed counters */ if (x86_pmu.version >= 2) { intel_pmu_ack_status(intel_pmu_get_status()); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0); } /* Reset LBRs and LBR freezing */ @@ -3035,8 +3139,7 @@ static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, continue; perf_sample_data_init(data, 0, event->hw.last_period); - if (perf_event_overflow(event, data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, data, regs); /* Inject one fake event is enough. */ break; @@ -3049,7 +3152,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int bit; int handled = 0; - u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); inc_irq_stat(apic_perf_irqs); @@ -3093,7 +3195,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; x86_pmu_handle_guest_pebs(regs, &data); static_call(x86_pmu_drain_pebs)(regs, &data); - status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* * PMI throttle may be triggered, which stops the PEBS event. @@ -3103,7 +3204,16 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * Update the MSR if pebs_enabled is changed. */ if (pebs_enabled != cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + /* + * Above PEBS handler (PEBS counters snapshotting) has updated fixed + * counter 3 and perf metrics counts if they are in counter group, + * unnecessary to update again. + */ + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; } /* @@ -3123,6 +3233,8 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) static_call(intel_pmu_update_topdown_event)(NULL, NULL); } + status &= hybrid(cpuc->pmu, intel_ctrl); + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status @@ -3132,6 +3244,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; + u64 last_period; handled++; @@ -3159,16 +3272,17 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (is_pebs_counter_event_group(event)) x86_pmu.drain_pebs(regs, &data); + last_period = event->hw.last_period; + if (!intel_pmu_save_and_restart(event)) continue; - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(&data, 0, last_period); if (has_branch_stack(event)) intel_pmu_lbr_save_brstack(&data, cpuc, event); - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, &data, regs); } return handled; @@ -3730,10 +3844,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); - /* Not all counters support the branch counter feature. */ - if (branch_sample_counters(event)) { + if (event->hw.dyn_constraint != ~0ULL) { c2 = dyn_constraint(cpuc, c2, idx); - c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->idxmsk64 &= event->hw.dyn_constraint; c2->weight = hweight64(c2->idxmsk64); } @@ -4074,6 +4187,39 @@ end: return start; } +static inline bool intel_pmu_has_acr(struct pmu *pmu) +{ + return !!hybrid(pmu, acr_cause_mask64); +} + +static bool intel_pmu_is_acr_group(struct perf_event *event) +{ + /* The group leader has the ACR flag set */ + if (is_acr_event_group(event)) + return true; + + /* The acr_mask is set */ + if (event->attr.config2) + return true; + + return false; +} + +static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, + u64 *cause_mask, int *num) +{ + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); + *cause_mask |= event->attr.config2; + *num += 1; +} + +static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, + int idx, u64 cause_mask) +{ + if (test_bit(idx, (unsigned long *)&cause_mask)) + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -4135,15 +4281,19 @@ static int intel_pmu_hw_config(struct perf_event *event) leader = event->group_leader; if (branch_sample_call_stack(leader)) return -EINVAL; - if (branch_sample_counters(leader)) + if (branch_sample_counters(leader)) { num++; + leader->hw.dyn_constraint &= x86_pmu.lbr_counters; + } leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; for_each_sibling_event(sibling, leader) { if (branch_sample_call_stack(sibling)) return -EINVAL; - if (branch_sample_counters(sibling)) + if (branch_sample_counters(sibling)) { num++; + sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; + } } if (num > fls(x86_pmu.lbr_counters)) @@ -4198,6 +4348,94 @@ static int intel_pmu_hw_config(struct perf_event *event) event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) { + struct perf_event *sibling, *leader = event->group_leader; + struct pmu *pmu = event->pmu; + bool has_sw_event = false; + int num = 0, idx = 0; + u64 cause_mask = 0; + + /* Not support perf metrics */ + if (is_metric_event(event)) + return -EINVAL; + + /* Not support freq mode */ + if (event->attr.freq) + return -EINVAL; + + /* PDist is not supported */ + if (event->attr.config2 && event->attr.precise_ip > 2) + return -EINVAL; + + /* The reload value cannot exceeds the max period */ + if (event->attr.sample_period > x86_pmu.max_period) + return -EINVAL; + /* + * The counter-constraints of each event cannot be finalized + * unless the whole group is scanned. However, it's hard + * to know whether the event is the last one of the group. + * Recalculate the counter-constraints for each event when + * adding a new event. + * + * The group is traversed twice, which may be optimized later. + * In the first round, + * - Find all events which do reload when other events + * overflow and set the corresponding counter-constraints + * - Add all events, which can cause other events reload, + * in the cause_mask + * - Error out if the number of events exceeds the HW limit + * - The ACR events must be contiguous. + * Error out if there are non-X86 events between ACR events. + * This is not a HW limit, but a SW limit. + * With the assumption, the intel_pmu_acr_late_setup() can + * easily convert the event idx to counter idx without + * traversing the whole event list. + */ + if (!is_x86_event(leader)) + return -EINVAL; + + if (leader->attr.config2) + intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) { + has_sw_event = true; + continue; + } + if (!sibling->attr.config2) + continue; + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num); + } + } + if (leader != event && event->attr.config2) { + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num); + } + + if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) || + num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) + return -EINVAL; + /* + * In the second round, apply the counter-constraints for + * the events which can cause other events reload. + */ + intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) + intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask); + } + + if (leader != event) + intel_pmu_set_acr_caused_constr(event, idx, cause_mask); + + leader->hw.flags |= PERF_X86_EVENT_ACR; + } + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -4345,7 +4583,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask, }; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return arr; /* @@ -4386,7 +4624,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) arr[pebs_enable] = (struct perf_guest_switch_msr){ .msr = MSR_IA32_PEBS_ENABLE, .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, - .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask & kvm_pmu->pebs_enable, }; if (arr[pebs_enable].host) { @@ -4943,7 +5181,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -5032,7 +5270,7 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } -static void update_pmu_cap(struct x86_hybrid_pmu *pmu) +static void update_pmu_cap(struct pmu *pmu) { unsigned int cntr, fixed_cntr, ecx, edx; union cpuid35_eax eax; @@ -5041,20 +5279,30 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); if (ebx.split.umask2) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; if (ebx.split.eq) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, &cntr, &fixed_cntr, &ecx, &edx); - pmu->cntr_mask64 = cntr; - pmu->fixed_cntr_mask64 = fixed_cntr; + hybrid(pmu, cntr_mask64) = cntr; + hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; + } + + if (eax.split.acr_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, + &cntr, &fixed_cntr, &ecx, &edx); + /* The mask of the counters which can be reloaded */ + hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); + + /* The mask of the counters which can cause a reload of reloadable counters */ + hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); } if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ - rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities); } } @@ -5141,7 +5389,7 @@ static bool init_hybrid_pmu(int cpu) goto end; if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) - update_pmu_cap(pmu); + update_pmu_cap(&pmu->pmu); intel_pmu_check_hybrid_pmus(pmu); @@ -5202,7 +5450,7 @@ static void intel_pmu_cpu_starting(int cpu) if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) { union perf_capabilities perf_cap; - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); if (!perf_cap.perf_metrics) { x86_pmu.intel_cap.perf_metrics = 0; x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); @@ -5515,7 +5763,7 @@ static __init void intel_clovertown_quirk(void) * these chips. */ pr_warn("PEBS disabled due to CPU errata\n"); - x86_pmu.pebs = 0; + x86_pmu.ds_pebs = 0; x86_pmu.pebs_constraints = NULL; } @@ -5610,24 +5858,24 @@ static bool check_msr(unsigned long msr, u64 mask) * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - if (rdmsrl_safe(msr, &val_old)) + if (rdmsrq_safe(msr, &val_old)) return false; /* - * Only change the bits which can be updated by wrmsrl. + * Only change the bits which can be updated by wrmsrq. */ val_tmp = val_old ^ mask; if (is_lbr_from(msr)) val_tmp = lbr_from_signext_quirk_wr(val_tmp); - if (wrmsrl_safe(msr, val_tmp) || - rdmsrl_safe(msr, &val_new)) + if (wrmsrq_safe(msr, val_tmp) || + rdmsrq_safe(msr, &val_new)) return false; /* - * Quirk only affects validation in wrmsr(), so wrmsrl()'s value - * should equal rdmsrl()'s even with the quirk. + * Quirk only affects validation in wrmsr(), so wrmsrq()'s value + * should equal rdmsrq()'s even with the quirk. */ if (val_new != val_tmp) return false; @@ -5638,7 +5886,7 @@ static bool check_msr(unsigned long msr, u64 mask) /* Here it's sure that the MSR can be safely accessed. * Restore the old value and return. */ - wrmsrl(msr, val_old); + wrmsrq(msr, val_old); return true; } @@ -6003,7 +6251,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) static umode_t pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.pebs ? attr->mode : 0; + return x86_pmu.ds_pebs ? attr->mode : 0; } static umode_t @@ -6034,6 +6282,21 @@ td_is_visible(struct kobject *kobj, struct attribute *attr, int i) return attr->mode; } +PMU_FORMAT_ATTR(acr_mask, "config2:0-63"); + +static struct attribute *format_acr_attrs[] = { + &format_attr_acr_mask.attr, + NULL +}; + +static umode_t +acr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + + return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0; +} + static struct attribute_group group_events_td = { .name = "events", .is_visible = td_is_visible, @@ -6076,6 +6339,12 @@ static struct attribute_group group_format_evtsel_ext = { .is_visible = evtsel_ext_is_visible, }; +static struct attribute_group group_format_acr = { + .name = "format", + .attrs = format_acr_attrs, + .is_visible = acr_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -6090,6 +6359,7 @@ static const struct attribute_group *attr_update[] = { &group_format_extra, &group_format_extra_skl, &group_format_evtsel_ext, + &group_format_acr, &group_default, NULL, }; @@ -6374,6 +6644,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_lbr, &hybrid_group_format_extra, &group_format_evtsel_ext, + &group_format_acr, &group_default, &hybrid_group_cpus, NULL, @@ -6566,6 +6837,7 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu) intel_pmu_init_grt(pmu); hybrid(pmu, event_constraints) = intel_skt_event_constraints; hybrid(pmu, extra_regs) = intel_cmt_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } __init int intel_pmu_init(void) @@ -6626,6 +6898,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; + x86_pmu.config_mask = X86_RAW_EVENT_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -6642,7 +6915,7 @@ __init int intel_pmu_init(void) if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; - rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, capabilities); x86_pmu.intel_cap.capabilities = capabilities; } @@ -6654,7 +6927,7 @@ __init int intel_pmu_init(void) if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) intel_pmu_arch_lbr_init(); - intel_ds_init(); + intel_pebs_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ @@ -6665,6 +6938,12 @@ __init int intel_pmu_init(void) } /* + * Many features on and after V6 require dynamic constraint, + * e.g., Arch PEBS, ACR. + */ + if (version >= 6) + x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; + /* * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_vfm) { @@ -6875,6 +7154,18 @@ __init int intel_pmu_init(void) name = "crestmont"; break; + case INTEL_ATOM_DARKMONT_X: + intel_pmu_init_skt(NULL); + intel_pmu_pebs_data_source_cmt(); + x86_pmu.pebs_latency_data = cmt_latency_data; + x86_pmu.get_event_constraints = cmt_get_event_constraints; + td_attr = skt_events_attrs; + mem_attr = grt_mem_attrs; + extra_attr = cmt_format_attr; + pr_cont("Darkmont events, "); + name = "darkmont"; + break; + case INTEL_WESTMERE: case INTEL_WESTMERE_EP: case INTEL_WESTMERE_EX: @@ -7305,8 +7596,17 @@ __init int intel_pmu_init(void) name = "meteorlake_hybrid"; break; + case INTEL_PANTHERLAKE_L: + pr_cont("Pantherlake Hybrid events, "); + name = "pantherlake_hybrid"; + goto lnl_common; + case INTEL_LUNARLAKE_M: case INTEL_ARROWLAKE: + pr_cont("Lunarlake Hybrid events, "); + name = "lunarlake_hybrid"; + + lnl_common: intel_pmu_init_hybrid(hybrid_big_small); x86_pmu.pebs_latency_data = lnl_latency_data; @@ -7328,8 +7628,6 @@ __init int intel_pmu_init(void) intel_pmu_init_skt(&pmu->pmu); intel_pmu_pebs_data_source_lnl(); - pr_cont("Lunarlake Hybrid events, "); - name = "lunarlake_hybrid"; break; case INTEL_ARROWLAKE_H: @@ -7417,6 +7715,18 @@ __init int intel_pmu_init(void) x86_pmu.attr_update = hybrid_attr_update; } + /* + * The archPerfmonExt (0x23) includes an enhanced enumeration of + * PMU architectural features with a per-core view. For non-hybrid, + * each core has the same PMU capabilities. It's good enough to + * update the x86_pmu from the booting CPU. For hybrid, the x86_pmu + * is used to keep the common capabilities. Still keep the values + * from the leaf 0xa. The core specific update will be done later + * when a new type is online. + */ + if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) + update_pmu_cap(NULL); + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index ae4ec16156bb..ec753e39b007 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -111,6 +111,7 @@ #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "../perf_event.h" #include "../probe.h" @@ -320,7 +321,7 @@ static inline u64 cstate_pmu_read_counter(struct perf_event *event) { u64 val; - rdmsrl(event->hw.event_base, val); + rdmsrq(event->hw.event_base, val); return val; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 1f7e1a692a7a..c0b7ac1c7594 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -10,6 +10,7 @@ #include <asm/tlbflush.h> #include <asm/insn.h> #include <asm/io.h> +#include <asm/msr.h> #include <asm/timer.h> #include "../perf_event.h" @@ -624,7 +625,7 @@ static int alloc_pebs_buffer(int cpu) int max, node = cpu_to_node(cpu); void *buffer, *insn_buff, *cea; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return 0; buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); @@ -659,7 +660,7 @@ static void release_pebs_buffer(int cpu) struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); void *cea; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) return; kfree(per_cpu(insn_buffer, cpu)); @@ -734,7 +735,7 @@ void release_ds_buffers(void) { int cpu; - if (!x86_pmu.bts && !x86_pmu.pebs) + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; for_each_possible_cpu(cpu) @@ -750,7 +751,8 @@ void release_ds_buffers(void) } for_each_possible_cpu(cpu) { - release_pebs_buffer(cpu); + if (x86_pmu.ds_pebs) + release_pebs_buffer(cpu); release_bts_buffer(cpu); } } @@ -761,15 +763,17 @@ void reserve_ds_buffers(void) int cpu; x86_pmu.bts_active = 0; - x86_pmu.pebs_active = 0; - if (!x86_pmu.bts && !x86_pmu.pebs) + if (x86_pmu.ds_pebs) + x86_pmu.pebs_active = 0; + + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; if (!x86_pmu.bts) bts_err = 1; - if (!x86_pmu.pebs) + if (!x86_pmu.ds_pebs) pebs_err = 1; for_each_possible_cpu(cpu) { @@ -781,7 +785,8 @@ void reserve_ds_buffers(void) if (!bts_err && alloc_bts_buffer(cpu)) bts_err = 1; - if (!pebs_err && alloc_pebs_buffer(cpu)) + if (x86_pmu.ds_pebs && !pebs_err && + alloc_pebs_buffer(cpu)) pebs_err = 1; if (bts_err && pebs_err) @@ -793,7 +798,7 @@ void reserve_ds_buffers(void) release_bts_buffer(cpu); } - if (pebs_err) { + if (x86_pmu.ds_pebs && pebs_err) { for_each_possible_cpu(cpu) release_pebs_buffer(cpu); } @@ -805,7 +810,7 @@ void reserve_ds_buffers(void) if (x86_pmu.bts && !bts_err) x86_pmu.bts_active = 1; - if (x86_pmu.pebs && !pebs_err) + if (x86_pmu.ds_pebs && !pebs_err) x86_pmu.pebs_active = 1; for_each_possible_cpu(cpu) { @@ -1355,9 +1360,8 @@ static void __intel_pmu_pebs_update_cfg(struct perf_event *event, } -static void intel_pmu_late_setup(void) +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; u64 pebs_data_cfg = 0; int i; @@ -1399,8 +1403,10 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) * + precise_ip < 2 for the non event IP * + For RTM TSX weight we need GPRs for the abort code. */ - gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && - (attr->sample_regs_intr & PEBS_GP_REGS); + gprs = ((sample_type & PERF_SAMPLE_REGS_INTR) && + (attr->sample_regs_intr & PEBS_GP_REGS)) || + ((sample_type & PERF_SAMPLE_REGS_USER) && + (attr->sample_regs_user & PEBS_GP_REGS)); tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == @@ -1515,7 +1521,7 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) else value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx]; } - wrmsrl(base + idx, value); + wrmsrq(base + idx, value); } static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc) @@ -1552,7 +1558,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) */ intel_pmu_drain_pebs_buffer(); adaptive_pebs_record_size_update(); - wrmsrl(MSR_PEBS_DATA_CFG, pebs_data_cfg); + wrmsrq(MSR_PEBS_DATA_CFG, pebs_data_cfg); cpuc->active_pebs_data_cfg = pebs_data_cfg; } } @@ -1615,7 +1621,7 @@ void intel_pmu_pebs_disable(struct perf_event *event) intel_pmu_pebs_via_pt_disable(event); if (cpuc->enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); hwc->config |= ARCH_PERFMON_EVENTSEL_INT; } @@ -1625,7 +1631,7 @@ void intel_pmu_pebs_enable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); } void intel_pmu_pebs_disable_all(void) @@ -1826,8 +1832,6 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); - data->period = event->hw.last_period; - /* * Use latency for weight (only avail with PEBS-LL) */ @@ -2080,7 +2084,6 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, sample_type = event->attr.sample_type; format_group = basic->format_group; perf_sample_data_init(data, 0, event->hw.last_period); - data->period = event->hw.last_period; setup_pebs_time(event, data, basic->tsc); @@ -2123,7 +2126,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, regs->flags &= ~PERF_EFLAGS_EXACT; } - if (sample_type & PERF_SAMPLE_REGS_INTR) + if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) adaptive_pebs_save_regs(regs, gprs); } @@ -2274,7 +2277,7 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) WARN_ON(this_cpu_read(cpu_hw_events.enabled)); prev_raw_count = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new_raw_count); + new_raw_count = rdpmc(hwc->event_base_rdpmc); local64_set(&hwc->prev_count, new_raw_count); /* @@ -2357,8 +2360,7 @@ __intel_pmu_pebs_last_event(struct perf_event *event, * All but the last records are processed. * The last one is left to be able to call the overflow handler. */ - if (perf_event_overflow(event, data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, data, regs); } if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { @@ -2377,8 +2379,25 @@ __intel_pmu_pebs_last_event(struct perf_event *event, */ intel_pmu_save_and_restart_reload(event, count); } - } else - intel_pmu_save_and_restart(event); + } else { + /* + * For a non-precise event, it's possible the + * counters-snapshotting records a positive value for the + * overflowed event. Then the HW auto-reload mechanism + * reset the counter to 0 immediately, because the + * pebs_event_reset is cleared if the PERF_X86_EVENT_AUTO_RELOAD + * is not set. The counter backwards may be observed in a + * PMI handler. + * + * Since the event value has been updated when processing the + * counters-snapshotting record, only needs to set the new + * period for the counter. + */ + if (is_pebs_counter_event_group(event)) + static_call(x86_pmu_set_period)(event); + else + intel_pmu_save_and_restart(event); + } } static __always_inline void @@ -2446,8 +2465,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ setup_pebs_fixed_sample_data); } -static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask) { + u64 pebs_enabled = cpuc->pebs_enabled & mask; struct perf_event *event; int bit; @@ -2458,7 +2478,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int * It needs to call intel_pmu_save_and_restart_reload() to * update the event->count for this case. */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) { event = cpuc->events[bit]; if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_save_and_restart_reload(event, 0); @@ -2493,7 +2513,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, size); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } @@ -2569,8 +2589,8 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d if (error[bit]) { perf_log_lost_samples(event, error[bit]); - if (iregs && perf_event_account_interrupt(event)) - x86_pmu_stop(event, 0); + if (iregs) + perf_event_account_interrupt(event); } if (counts[bit]) { @@ -2607,7 +2627,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } @@ -2650,10 +2670,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d } /* - * BTS, PEBS probe and setup + * PEBS probe and setup */ -void __init intel_ds_init(void) +void __init intel_pebs_init(void) { /* * No support for 32bit formats @@ -2661,13 +2681,12 @@ void __init intel_ds_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64)) return; - x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); - x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + x86_pmu.ds_pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.version <= 4) x86_pmu.pebs_no_isolation = 1; - if (x86_pmu.pebs) { + if (x86_pmu.ds_pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; char *pebs_qual = ""; int format = x86_pmu.intel_cap.pebs_format; @@ -2675,6 +2694,11 @@ void __init intel_ds_init(void) if (format < 4) x86_pmu.intel_cap.pebs_baseline = 0; + x86_pmu.pebs_enable = intel_pmu_pebs_enable; + x86_pmu.pebs_disable = intel_pmu_pebs_disable; + x86_pmu.pebs_enable_all = intel_pmu_pebs_enable_all; + x86_pmu.pebs_disable_all = intel_pmu_pebs_disable_all; + switch (format) { case 0: pr_cont("PEBS fmt0%c, ", pebs_type); @@ -2759,7 +2783,7 @@ void __init intel_ds_init(void) default: pr_cont("no PEBS fmt%d%c, ", format, pebs_type); - x86_pmu.pebs = 0; + x86_pmu.ds_pebs = 0; } } } @@ -2768,8 +2792,8 @@ void perf_restore_debug_store(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - if (!x86_pmu.bts && !x86_pmu.pebs) + if (!x86_pmu.bts && !x86_pmu.ds_pebs) return; - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); + wrmsrq(MSR_IA32_DS_AREA, (unsigned long)ds); } diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 034a1f6a457c..e614baf42926 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -5,6 +5,7 @@ #include <linux/types.h> #include <asm/hardirq.h> +#include <asm/msr.h> #include "../perf_event.h" @@ -159,18 +160,18 @@ static void knc_pmu_disable_all(void) { u64 val; - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + rdmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + wrmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); } static void knc_pmu_enable_all(int added) { u64 val; - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + rdmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + wrmsrq(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); } static inline void @@ -182,7 +183,7 @@ knc_pmu_disable_event(struct perf_event *event) val = hwc->config; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); + (void)wrmsrq_safe(hwc->config_base + hwc->idx, val); } static void knc_pmu_enable_event(struct perf_event *event) @@ -193,21 +194,21 @@ static void knc_pmu_enable_event(struct perf_event *event) val = hwc->config; val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); + (void)wrmsrq_safe(hwc->config_base + hwc->idx, val); } static inline u64 knc_pmu_get_status(void) { u64 status; - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + rdmsrq(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); return status; } static inline void knc_pmu_ack_status(u64 ack) { - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); + wrmsrq(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); } static int knc_pmu_handle_irq(struct pt_regs *regs) @@ -241,19 +242,20 @@ again: for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; + u64 last_period; handled++; if (!test_bit(bit, cpuc->active_mask)) continue; + last_period = event->hw.last_period; if (!intel_pmu_save_and_restart(event)) continue; - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(&data, 0, last_period); - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, &data, regs); } /* diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f44c3d866f24..7aa59966e7c3 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -137,9 +137,9 @@ static void __intel_pmu_lbr_enable(bool pmi) if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) - wrmsrl(MSR_LBR_SELECT, lbr_select); + wrmsrq(MSR_LBR_SELECT, lbr_select); - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) @@ -155,10 +155,10 @@ static void __intel_pmu_lbr_enable(bool pmi) debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; if (orig_debugctl != debugctl) - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); if (static_cpu_has(X86_FEATURE_ARCH_LBR)) - wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); + wrmsrq(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } void intel_pmu_lbr_reset_32(void) @@ -166,7 +166,7 @@ void intel_pmu_lbr_reset_32(void) int i; for (i = 0; i < x86_pmu.lbr_nr; i++) - wrmsrl(x86_pmu.lbr_from + i, 0); + wrmsrq(x86_pmu.lbr_from + i, 0); } void intel_pmu_lbr_reset_64(void) @@ -174,17 +174,17 @@ void intel_pmu_lbr_reset_64(void) int i; for (i = 0; i < x86_pmu.lbr_nr; i++) { - wrmsrl(x86_pmu.lbr_from + i, 0); - wrmsrl(x86_pmu.lbr_to + i, 0); + wrmsrq(x86_pmu.lbr_from + i, 0); + wrmsrq(x86_pmu.lbr_to + i, 0); if (x86_pmu.lbr_has_info) - wrmsrl(x86_pmu.lbr_info + i, 0); + wrmsrq(x86_pmu.lbr_info + i, 0); } } static void intel_pmu_arch_lbr_reset(void) { /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ - wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); + wrmsrq(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); } void intel_pmu_lbr_reset(void) @@ -199,7 +199,7 @@ void intel_pmu_lbr_reset(void) cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && cpuc->lbr_select) - wrmsrl(MSR_LBR_SELECT, 0); + wrmsrq(MSR_LBR_SELECT, 0); } /* @@ -209,7 +209,7 @@ static inline u64 intel_pmu_lbr_tos(void) { u64 tos; - rdmsrl(x86_pmu.lbr_tos, tos); + rdmsrq(x86_pmu.lbr_tos, tos); return tos; } @@ -282,17 +282,17 @@ static u64 lbr_from_signext_quirk_rd(u64 val) static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); - wrmsrl(x86_pmu.lbr_from + idx, val); + wrmsrq(x86_pmu.lbr_from + idx, val); } static __always_inline void wrlbr_to(unsigned int idx, u64 val) { - wrmsrl(x86_pmu.lbr_to + idx, val); + wrmsrq(x86_pmu.lbr_to + idx, val); } static __always_inline void wrlbr_info(unsigned int idx, u64 val) { - wrmsrl(x86_pmu.lbr_info + idx, val); + wrmsrq(x86_pmu.lbr_info + idx, val); } static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) @@ -302,7 +302,7 @@ static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) if (lbr) return lbr->from; - rdmsrl(x86_pmu.lbr_from + idx, val); + rdmsrq(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } @@ -314,7 +314,7 @@ static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) if (lbr) return lbr->to; - rdmsrl(x86_pmu.lbr_to + idx, val); + rdmsrq(x86_pmu.lbr_to + idx, val); return val; } @@ -326,7 +326,7 @@ static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) if (lbr) return lbr->info; - rdmsrl(x86_pmu.lbr_info + idx, val); + rdmsrq(x86_pmu.lbr_info + idx, val); return val; } @@ -380,10 +380,10 @@ void intel_pmu_lbr_restore(void *ctx) wrlbr_info(lbr_idx, 0); } - wrmsrl(x86_pmu.lbr_tos, tos); + wrmsrq(x86_pmu.lbr_tos, tos); if (cpuc->lbr_select) - wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); + wrmsrq(MSR_LBR_SELECT, task_ctx->lbr_sel); } static void intel_pmu_arch_lbr_restore(void *ctx) @@ -475,7 +475,7 @@ void intel_pmu_lbr_save(void *ctx) task_ctx->tos = tos; if (cpuc->lbr_select) - rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); + rdmsrq(MSR_LBR_SELECT, task_ctx->lbr_sel); } static void intel_pmu_arch_lbr_save(void *ctx) @@ -752,7 +752,7 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) u64 lbr; } msr_lastbranch; - rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); + rdmsrq(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); perf_clear_branch_entry_bitfields(br); @@ -1602,7 +1602,7 @@ void __init intel_pmu_arch_lbr_init(void) goto clear_arch_lbr; /* Apply the max depth of Arch LBR */ - if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + if (wrmsrq_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) goto clear_arch_lbr; x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; @@ -1618,7 +1618,7 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_nr = lbr_nr; if (!!x86_pmu.lbr_counters) - x86_pmu.flags |= PMU_FL_BR_CNTR; + x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT; if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index c85a9fc44355..e5fd7367e45d 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -13,6 +13,7 @@ #include <asm/cpu_device_id.h> #include <asm/hardirq.h> #include <asm/apic.h> +#include <asm/msr.h> #include "../perf_event.h" @@ -859,9 +860,9 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) u64 v; /* an official way for overflow indication */ - rdmsrl(hwc->config_base, v); + rdmsrq(hwc->config_base, v); if (v & P4_CCCR_OVF) { - wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); + wrmsrq(hwc->config_base, v & ~P4_CCCR_OVF); return 1; } @@ -872,7 +873,7 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) * the counter has reached zero value and continued counting before * real NMI signal was received: */ - rdmsrl(hwc->event_base, v); + rdmsrq(hwc->event_base, v); if (!(v & ARCH_P4_UNFLAGGED_BIT)) return 1; @@ -897,8 +898,8 @@ static void p4_pmu_disable_pebs(void) * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! * - * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); - * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); + * (void)wrmsrq_safe(MSR_IA32_PEBS_ENABLE, 0); + * (void)wrmsrq_safe(MSR_P4_PEBS_MATRIX_VERT, 0); */ } @@ -911,7 +912,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)wrmsrl_safe(hwc->config_base, + (void)wrmsrq_safe(hwc->config_base, p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -944,8 +945,8 @@ static void p4_pmu_enable_pebs(u64 config) bind = &p4_pebs_bind_map[idx]; - (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); + (void)wrmsrq_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrq_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void __p4_pmu_enable_event(struct perf_event *event) @@ -979,8 +980,8 @@ static void __p4_pmu_enable_event(struct perf_event *event) */ p4_pmu_enable_pebs(hwc->config); - (void)wrmsrl_safe(escr_addr, escr_conf); - (void)wrmsrl_safe(hwc->config_base, + (void)wrmsrq_safe(escr_addr, escr_conf); + (void)wrmsrq_safe(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } @@ -1024,7 +1025,7 @@ static int p4_pmu_set_period(struct perf_event *event) * * the former idea is taken from OProfile code */ - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); } return ret; @@ -1072,8 +1073,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) continue; - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, &data, regs); } if (handled) @@ -1398,7 +1398,7 @@ __init int p4_pmu_init(void) */ for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); - wrmsrl_safe(reg, 0ULL); + wrmsrq_safe(reg, 0ULL); } return 0; diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index 65b45e9d7016..6e41de355bd8 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -3,6 +3,7 @@ #include <linux/types.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include "../perf_event.h" @@ -142,9 +143,9 @@ static void p6_pmu_disable_all(void) u64 val; /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); + rdmsrq(MSR_P6_EVNTSEL0, val); val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); + wrmsrq(MSR_P6_EVNTSEL0, val); } static void p6_pmu_enable_all(int added) @@ -152,9 +153,9 @@ static void p6_pmu_enable_all(int added) unsigned long val; /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); + rdmsrq(MSR_P6_EVNTSEL0, val); val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); + wrmsrq(MSR_P6_EVNTSEL0, val); } static inline void @@ -163,7 +164,7 @@ p6_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; - (void)wrmsrl_safe(hwc->config_base, val); + (void)wrmsrq_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -180,7 +181,7 @@ static void p6_pmu_enable_event(struct perf_event *event) * to actually enable the events. */ - (void)wrmsrl_safe(hwc->config_base, val); + (void)wrmsrq_safe(hwc->config_base, val); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index fa37565f6418..e8cf29d2b10c 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -18,12 +18,13 @@ #include <linux/slab.h> #include <linux/device.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/perf_event.h> #include <asm/insn.h> #include <asm/io.h> #include <asm/intel_pt.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include "../perf_event.h" #include "pt.h" @@ -194,7 +195,7 @@ static int __init pt_pmu_hw_init(void) int ret; long i; - rdmsrl(MSR_PLATFORM_INFO, reg); + rdmsrq(MSR_PLATFORM_INFO, reg); pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8; /* @@ -230,7 +231,7 @@ static int __init pt_pmu_hw_init(void) * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace * post-VMXON. */ - rdmsrl(MSR_IA32_VMX_MISC, reg); + rdmsrq(MSR_IA32_VMX_MISC, reg); if (reg & BIT(14)) pt_pmu.vmx = true; } @@ -426,7 +427,7 @@ static void pt_config_start(struct perf_event *event) if (READ_ONCE(pt->vmx_on)) perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); else - wrmsrl(MSR_IA32_RTIT_CTL, ctl); + wrmsrq(MSR_IA32_RTIT_CTL, ctl); WRITE_ONCE(event->hw.aux_config, ctl); } @@ -485,12 +486,12 @@ static u64 pt_config_filters(struct perf_event *event) /* avoid redundant msr writes */ if (pt->filters.filter[range].msr_a != filter->msr_a) { - wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a); + wrmsrq(pt_address_ranges[range].msr_a, filter->msr_a); pt->filters.filter[range].msr_a = filter->msr_a; } if (pt->filters.filter[range].msr_b != filter->msr_b) { - wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b); + wrmsrq(pt_address_ranges[range].msr_b, filter->msr_b); pt->filters.filter[range].msr_b = filter->msr_b; } @@ -509,7 +510,7 @@ static void pt_config(struct perf_event *event) /* First round: clear STATUS, in particular the PSB byte counter. */ if (!event->hw.aux_config) { perf_event_itrace_started(event); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); + wrmsrq(MSR_IA32_RTIT_STATUS, 0); } reg = pt_config_filters(event); @@ -569,7 +570,7 @@ static void pt_config_stop(struct perf_event *event) ctl &= ~RTIT_CTL_TRACEEN; if (!READ_ONCE(pt->vmx_on)) - wrmsrl(MSR_IA32_RTIT_CTL, ctl); + wrmsrq(MSR_IA32_RTIT_CTL, ctl); WRITE_ONCE(event->hw.aux_config, ctl); @@ -658,13 +659,13 @@ static void pt_config_buffer(struct pt_buffer *buf) reg = virt_to_phys(base); if (pt->output_base != reg) { pt->output_base = reg; - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg); + wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, reg); } reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32); if (pt->output_mask != reg) { pt->output_mask = reg; - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); + wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, reg); } } @@ -926,7 +927,7 @@ static void pt_handle_status(struct pt *pt) int advance = 0; u64 status; - rdmsrl(MSR_IA32_RTIT_STATUS, status); + rdmsrq(MSR_IA32_RTIT_STATUS, status); if (status & RTIT_STATUS_ERROR) { pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); @@ -970,7 +971,7 @@ static void pt_handle_status(struct pt *pt) if (advance) pt_buffer_advance(buf); - wrmsrl(MSR_IA32_RTIT_STATUS, status); + wrmsrq(MSR_IA32_RTIT_STATUS, status); } /** @@ -985,12 +986,12 @@ static void pt_read_offset(struct pt_buffer *buf) struct topa_page *tp; if (!buf->single) { - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base); + rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base); tp = phys_to_virt(pt->output_base); buf->cur = &tp->topa; } - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask); + rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask); /* offset within current output region */ buf->output_off = pt->output_mask >> 32; /* index of current output region within this table */ @@ -1585,7 +1586,7 @@ void intel_pt_handle_vmx(int on) /* Turn PTs back on */ if (!on && event) - wrmsrl(MSR_IA32_RTIT_CTL, event->hw.aux_config); + wrmsrq(MSR_IA32_RTIT_CTL, event->hw.aux_config); local_irq_restore(flags); } @@ -1611,7 +1612,7 @@ static void pt_event_start(struct perf_event *event, int mode) * PMI might have just cleared these, so resume_allowed * must be checked again also. */ - rdmsrl(MSR_IA32_RTIT_STATUS, status); + rdmsrq(MSR_IA32_RTIT_STATUS, status); if (!(status & (RTIT_STATUS_TRIGGEREN | RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED)) && @@ -1839,7 +1840,7 @@ static __init int pt_init(void) for_each_online_cpu(cpu) { u64 ctl; - ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); + ret = rdmsrq_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); if (!ret && (ctl & RTIT_CTL_TRACEEN)) prior_warn++; } @@ -1863,6 +1864,8 @@ static __init int pt_init(void) if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; + else + pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE | diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index a34e50fc4a8f..e0815a12db90 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -3,6 +3,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "uncore.h" #include "uncore_discovery.h" @@ -150,7 +151,7 @@ u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *eve { u64 count; - rdmsrl(event->hw.event_base, count); + rdmsrq(event->hw.event_base, count); return count; } @@ -305,17 +306,11 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) { struct intel_uncore_box *box; struct perf_event *event; - unsigned long flags; int bit; box = container_of(hrtimer, struct intel_uncore_box, hrtimer); if (!box->n_active || box->cpu != smp_processor_id()) return HRTIMER_NORESTART; - /* - * disable local interrupt to prevent uncore_pmu_event_start/stop - * to interrupt the update process - */ - local_irq_save(flags); /* * handle boxes with an active event list as opposed to active @@ -328,8 +323,6 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) uncore_perf_event_update(box, box->events[bit]); - local_irq_restore(flags); - hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); return HRTIMER_RESTART; } @@ -337,7 +330,7 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) @@ -347,7 +340,7 @@ void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) { - hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 571e44b49691..18a3022f26a0 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -5,6 +5,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/msr.h> #include "uncore.h" #include "uncore_discovery.h" @@ -441,17 +442,17 @@ static u64 intel_generic_uncore_box_ctl(struct intel_uncore_box *box) void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box) { - wrmsrl(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_INT); + wrmsrq(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_INT); } void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box) { - wrmsrl(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ); + wrmsrq(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ); } void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box) { - wrmsrl(intel_generic_uncore_box_ctl(box), 0); + wrmsrq(intel_generic_uncore_box_ctl(box), 0); } static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box, @@ -459,7 +460,7 @@ static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box, { struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, hwc->config); + wrmsrq(hwc->config_base, hwc->config); } static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box, @@ -467,7 +468,7 @@ static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box, { struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, 0); + wrmsrq(hwc->config_base, 0); } static struct intel_uncore_ops generic_uncore_msr_ops = { diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 466833478e81..8962e7cb21e3 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Nehalem-EX/Westmere-EX uncore support */ #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include "uncore.h" /* NHM-EX event control */ @@ -200,12 +201,12 @@ DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) { - wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); + wrmsrq(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); } static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box) { - wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0); + wrmsrq(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0); } static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) @@ -214,12 +215,12 @@ static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) u64 config; if (msr) { - rdmsrl(msr, config); + rdmsrq(msr, config); config &= ~((1ULL << uncore_num_counters(box)) - 1); /* WBox has a fixed counter */ if (uncore_msr_fixed_ctl(box)) config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); + wrmsrq(msr, config); } } @@ -229,18 +230,18 @@ static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) u64 config; if (msr) { - rdmsrl(msr, config); + rdmsrq(msr, config); config |= (1ULL << uncore_num_counters(box)) - 1; /* WBox has a fixed counter */ if (uncore_msr_fixed_ctl(box)) config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); + wrmsrq(msr, config); } } static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) { - wrmsrl(event->hw.config_base, 0); + wrmsrq(event->hw.config_base, 0); } static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -248,11 +249,11 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p struct hw_perf_event *hwc = &event->hw; if (hwc->idx == UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + wrmsrq(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); else - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); + wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); } #define NHMEX_UNCORE_OPS_COMMON_INIT() \ @@ -382,10 +383,10 @@ static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event_extra *reg2 = &hwc->branch_reg; if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, reg2->config); + wrmsrq(reg1->reg, reg1->config); + wrmsrq(reg1->reg + 1, reg2->config); } - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + wrmsrq(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); } @@ -467,12 +468,12 @@ static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event_extra *reg2 = &hwc->branch_reg; if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, 0); - wrmsrl(reg1->reg + 1, reg1->config); - wrmsrl(reg1->reg + 2, reg2->config); - wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + wrmsrq(reg1->reg, 0); + wrmsrq(reg1->reg + 1, reg1->config); + wrmsrq(reg1->reg + 2, reg2->config); + wrmsrq(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); } - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); } static struct attribute *nhmex_uncore_sbox_formats_attr[] = { @@ -842,25 +843,25 @@ static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct per idx = __BITS_VALUE(reg1->idx, 0, 8); if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + wrmsrq(__BITS_VALUE(reg1->reg, 0, 16), nhmex_mbox_shared_reg_config(box, idx)); idx = __BITS_VALUE(reg1->idx, 1, 8); if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + wrmsrq(__BITS_VALUE(reg1->reg, 1, 16), nhmex_mbox_shared_reg_config(box, idx)); if (reg2->idx != EXTRA_REG_NONE) { - wrmsrl(reg2->reg, 0); + wrmsrq(reg2->reg, 0); if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, + wrmsrq(reg2->reg + 1, reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + wrmsrq(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + wrmsrq(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); } } - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); + wrmsrq(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); } DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); @@ -1121,31 +1122,31 @@ static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct per switch (idx % 6) { case 0: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + wrmsrq(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); break; case 1: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); + wrmsrq(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); break; case 2: case 3: - wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + wrmsrq(NHMEX_R_MSR_PORTN_QLX_CFG(port), uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); break; case 4: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); break; case 5: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); + wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrq(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); break; } - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + wrmsrq(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); } diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index edb7fd50efe0..a1a96833e30e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */ +#include <asm/msr.h> #include "uncore.h" #include "uncore_discovery.h" @@ -260,34 +261,34 @@ static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event *hwc = &event->hw; if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + wrmsrq(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); else - wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); + wrmsrq(hwc->config_base, SNB_UNC_CTL_EN); } static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) { - wrmsrl(event->hw.config_base, 0); + wrmsrq(event->hw.config_base, 0); } static void snb_uncore_msr_init_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) { - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + wrmsrq(SNB_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); } } static void snb_uncore_msr_enable_box(struct intel_uncore_box *box) { - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + wrmsrq(SNB_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); } static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0); + wrmsrq(SNB_UNC_PERF_GLOBAL_CTL, 0); } static struct uncore_event_desc snb_uncore_events[] = { @@ -372,7 +373,7 @@ void snb_uncore_cpu_init(void) static void skl_uncore_msr_init_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) { - wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + wrmsrq(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); } @@ -383,14 +384,14 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box) static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) { - wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + wrmsrq(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); } static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) - wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, 0); + wrmsrq(SKL_UNC_PERF_GLOBAL_CTL, 0); } static struct intel_uncore_ops skl_uncore_msr_ops = { @@ -504,7 +505,7 @@ static int icl_get_cbox_num(void) { u64 num_boxes; - rdmsrl(ICL_UNC_CBO_CONFIG, num_boxes); + rdmsrq(ICL_UNC_CBO_CONFIG, num_boxes); return num_boxes & ICL_UNC_NUM_CBO_MASK; } @@ -525,7 +526,7 @@ static struct intel_uncore_type *tgl_msr_uncores[] = { static void rkl_uncore_msr_init_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) - wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); + wrmsrq(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); } void tgl_uncore_cpu_init(void) @@ -541,24 +542,24 @@ void tgl_uncore_cpu_init(void) static void adl_uncore_msr_init_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) - wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); + wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); } static void adl_uncore_msr_enable_box(struct intel_uncore_box *box) { - wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); + wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); } static void adl_uncore_msr_disable_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) - wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0); + wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, 0); } static void adl_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) - wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0); + wrmsrq(ADL_UNC_PERF_GLOBAL_CTL, 0); } static struct intel_uncore_ops adl_uncore_msr_ops = { @@ -691,7 +692,7 @@ static struct intel_uncore_type mtl_uncore_hac_cbox = { static void mtl_uncore_msr_init_box(struct intel_uncore_box *box) { - wrmsrl(uncore_msr_box_ctl(box), SNB_UNC_GLOBAL_CTL_EN); + wrmsrq(uncore_msr_box_ctl(box), SNB_UNC_GLOBAL_CTL_EN); } static struct intel_uncore_ops mtl_uncore_msr_ops = { @@ -758,7 +759,7 @@ static struct intel_uncore_type *lnl_msr_uncores[] = { static void lnl_uncore_msr_init_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) - wrmsrl(LNL_UNC_MSR_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); + wrmsrq(LNL_UNC_MSR_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); } static struct intel_uncore_ops lnl_uncore_msr_ops = { @@ -1306,12 +1307,12 @@ int skl_uncore_pci_init(void) /* Nehalem uncore support */ static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) { - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); + wrmsrq(NHM_UNC_PERF_GLOBAL_CTL, 0); } static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) { - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); + wrmsrq(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); } static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -1319,9 +1320,9 @@ static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct per struct hw_perf_event *hwc = &event->hw; if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + wrmsrq(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); else - wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); + wrmsrq(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); } static struct attribute *nhm_uncore_formats_attr[] = { diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 60973c209c0e..2824dc9950be 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* SandyBridge-EP/IvyTown uncore support */ #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include "uncore.h" #include "uncore_discovery.h" @@ -618,9 +619,9 @@ static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) msr = uncore_msr_box_ctl(box); if (msr) { - rdmsrl(msr, config); + rdmsrq(msr, config); config |= SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); + wrmsrq(msr, config); } } @@ -631,9 +632,9 @@ static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) msr = uncore_msr_box_ctl(box); if (msr) { - rdmsrl(msr, config); + rdmsrq(msr, config); config &= ~SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); + wrmsrq(msr, config); } } @@ -643,9 +644,9 @@ static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct p struct hw_perf_event_extra *reg1 = &hwc->extra_reg; if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); + wrmsrq(reg1->reg, uncore_shared_reg_config(box, 0)); - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); + wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, @@ -653,7 +654,7 @@ static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, { struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, hwc->config); + wrmsrq(hwc->config_base, hwc->config); } static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) @@ -661,7 +662,7 @@ static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) unsigned msr = uncore_msr_box_ctl(box); if (msr) - wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); + wrmsrq(msr, SNBEP_PMON_BOX_CTL_INT); } static struct attribute *snbep_uncore_formats_attr[] = { @@ -1532,7 +1533,7 @@ static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box) { unsigned msr = uncore_msr_box_ctl(box); if (msr) - wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT); + wrmsrq(msr, IVBEP_PMON_BOX_CTL_INT); } static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box) @@ -1783,11 +1784,11 @@ static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_ev if (reg1->idx != EXTRA_REG_NONE) { u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 6, filter >> 32); + wrmsrq(reg1->reg, filter & 0xffffffff); + wrmsrq(reg1->reg + 6, filter >> 32); } - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); + wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } static struct intel_uncore_ops ivbep_uncore_cbox_ops = { @@ -2767,11 +2768,11 @@ static void hswep_cbox_enable_event(struct intel_uncore_box *box, if (reg1->idx != EXTRA_REG_NONE) { u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 1, filter >> 32); + wrmsrq(reg1->reg, filter & 0xffffffff); + wrmsrq(reg1->reg + 1, filter >> 32); } - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); + wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } static struct intel_uncore_ops hswep_uncore_cbox_ops = { @@ -2816,7 +2817,7 @@ static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) for_each_set_bit(i, (unsigned long *)&init, 64) { flags |= (1ULL << i); - wrmsrl(msr, flags); + wrmsrq(msr, flags); } } } @@ -3708,7 +3709,7 @@ static void skx_iio_enable_event(struct intel_uncore_box *box, { struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); + wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } static struct intel_uncore_ops skx_uncore_iio_ops = { @@ -3765,7 +3766,7 @@ static int skx_msr_cpu_bus_read(int cpu, u64 *topology) { u64 msr_value; - if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + if (rdmsrq_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) return -ENXIO; @@ -4655,9 +4656,9 @@ static void snr_cha_enable_event(struct intel_uncore_box *box, struct hw_perf_event_extra *reg1 = &hwc->extra_reg; if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, reg1->config); + wrmsrq(reg1->reg, reg1->config); - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); + wrmsrq(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); } static struct intel_uncore_ops snr_uncore_chabox_ops = { @@ -4891,28 +4892,28 @@ static struct uncore_event_desc snr_uncore_iio_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), /* Free-Running IIO BANDWIDTH IN Counters */ INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.0517578125e-5"), INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -5485,37 +5486,6 @@ static struct freerunning_counters icx_iio_freerunning[] = { [ICX_IIO_MSR_BW_IN] = { 0xaa0, 0x1, 0x10, 8, 48, icx_iio_bw_freerunning_box_offsets }, }; -static struct uncore_event_desc icx_uncore_iio_freerunning_events[] = { - /* Free-Running IIO CLOCKS Counter */ - INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), - /* Free-Running IIO BANDWIDTH IN Counters */ - INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), - { /* end: all zeroes */ }, -}; - static struct intel_uncore_type icx_uncore_iio_free_running = { .name = "iio_free_running", .num_counters = 9, @@ -5523,7 +5493,7 @@ static struct intel_uncore_type icx_uncore_iio_free_running = { .num_freerunning_types = ICX_IIO_FREERUNNING_TYPE_MAX, .freerunning = icx_iio_freerunning, .ops = &skx_uncore_iio_freerunning_ops, - .event_descs = icx_uncore_iio_freerunning_events, + .event_descs = snr_uncore_iio_freerunning_events, .format_group = &skx_uncore_iio_freerunning_format_group, }; @@ -5913,9 +5883,9 @@ static void spr_uncore_msr_enable_event(struct intel_uncore_box *box, struct hw_perf_event_extra *reg1 = &hwc->extra_reg; if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, reg1->config); + wrmsrq(reg1->reg, reg1->config); - wrmsrl(hwc->config_base, hwc->config); + wrmsrq(hwc->config_base, hwc->config); } static void spr_uncore_msr_disable_event(struct intel_uncore_box *box, @@ -5925,9 +5895,9 @@ static void spr_uncore_msr_disable_event(struct intel_uncore_box *box, struct hw_perf_event_extra *reg1 = &hwc->extra_reg; if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, 0); + wrmsrq(reg1->reg, 0); - wrmsrl(hwc->config_base, 0); + wrmsrq(hwc->config_base, 0); } static int spr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -6320,69 +6290,13 @@ static struct freerunning_counters spr_iio_freerunning[] = { [SPR_IIO_MSR_BW_OUT] = { 0x3808, 0x1, 0x10, 8, 48 }, }; -static struct uncore_event_desc spr_uncore_iio_freerunning_events[] = { - /* Free-Running IIO CLOCKS Counter */ - INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), - /* Free-Running IIO BANDWIDTH IN Counters */ - INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), - /* Free-Running IIO BANDWIDTH OUT Counters */ - INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x30"), - INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x31"), - INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x32"), - INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x33"), - INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port4, "event=0xff,umask=0x34"), - INTEL_UNCORE_EVENT_DESC(bw_out_port4.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port4.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port5, "event=0xff,umask=0x35"), - INTEL_UNCORE_EVENT_DESC(bw_out_port5.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port5.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port6, "event=0xff,umask=0x36"), - INTEL_UNCORE_EVENT_DESC(bw_out_port6.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port6.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(bw_out_port7, "event=0xff,umask=0x37"), - INTEL_UNCORE_EVENT_DESC(bw_out_port7.scale, "3.814697266e-6"), - INTEL_UNCORE_EVENT_DESC(bw_out_port7.unit, "MiB"), - { /* end: all zeroes */ }, -}; - static struct intel_uncore_type spr_uncore_iio_free_running = { .name = "iio_free_running", .num_counters = 17, .num_freerunning_types = SPR_IIO_FREERUNNING_TYPE_MAX, .freerunning = spr_iio_freerunning, .ops = &skx_uncore_iio_freerunning_ops, - .event_descs = spr_uncore_iio_freerunning_events, + .event_descs = snr_uncore_iio_freerunning_events, .format_group = &skx_uncore_iio_freerunning_format_group, }; @@ -6572,7 +6486,7 @@ void spr_uncore_cpu_init(void) * of UNCORE_SPR_CHA) is incorrect on some SPR variants because of a * firmware bug. Using the value from SPR_MSR_UNC_CBO_CONFIG to replace it. */ - rdmsrl(SPR_MSR_UNC_CBO_CONFIG, num_cbo); + rdmsrq(SPR_MSR_UNC_CBO_CONFIG, num_cbo); /* * The MSR doesn't work on the EMR XCC, but the firmware bug doesn't impact * the EMR XCC. Don't let the value from the MSR replace the existing value. diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 45b1866ff051..7f5007a4752a 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -3,6 +3,8 @@ #include <linux/sysfs.h> #include <linux/nospec.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> + #include "probe.h" enum perf_msr_id { @@ -231,7 +233,7 @@ static inline u64 msr_read_counter(struct perf_event *event) u64 now; if (event->hw.event_base) - rdmsrl(event->hw.event_base, now); + rdmsrq(event->hw.event_base, now); else now = rdtsc_ordered(); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2c0ce0e9545e..2b969386dcdd 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -17,6 +17,7 @@ #include <asm/fpu/xstate.h> #include <asm/intel_ds.h> #include <asm/cpu.h> +#include <asm/msr.h> /* To enable MSR tracing please use the generic trace points. */ @@ -110,14 +111,26 @@ static inline bool is_topdown_event(struct perf_event *event) return is_metric_event(event) || is_slots_event(event); } +int is_x86_event(struct perf_event *event); + +static inline bool check_leader_group(struct perf_event *leader, int flags) +{ + return is_x86_event(leader) ? !!(leader->hw.flags & flags) : false; +} + static inline bool is_branch_counters_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; + return check_leader_group(event->group_leader, PERF_X86_EVENT_BRANCH_COUNTERS); } static inline bool is_pebs_counter_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_PEBS_CNTR); +} + +static inline bool is_acr_event_group(struct perf_event *event) +{ + return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR); } struct amd_nb { @@ -261,6 +274,7 @@ struct cpu_hw_events { struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; int n_excl; /* the number of exclusive events */ + int n_late_setup; /* the num of events needs late setup */ unsigned int txn_flags; int is_fake; @@ -286,6 +300,10 @@ struct cpu_hw_events { u64 fixed_ctrl_val; u64 active_fixed_ctrl_val; + /* Intel ACR configuration */ + u64 acr_cfg_b[X86_PMC_IDX_MAX]; + u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* * Intel LBR bits */ @@ -707,6 +725,15 @@ struct x86_hybrid_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; struct event_constraint unconstrained; u64 hw_cache_event_ids @@ -789,6 +816,10 @@ struct x86_pmu { int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); void (*late_setup)(void); + void (*pebs_enable)(struct perf_event *event); + void (*pebs_disable)(struct perf_event *event); + void (*pebs_enable_all)(void); + void (*pebs_disable_all)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -805,6 +836,14 @@ struct x86_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; int cntval_bits; u64 cntval_mask; union { @@ -871,7 +910,7 @@ struct x86_pmu { */ unsigned int bts :1, bts_active :1, - pebs :1, + ds_pebs :1, pebs_active :1, pebs_broken :1, pebs_prec_dist :1, @@ -1042,6 +1081,7 @@ do { \ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ +#define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -1084,6 +1124,7 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } +int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; @@ -1091,6 +1132,10 @@ DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); +DECLARE_STATIC_CALL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); +DECLARE_STATIC_CALL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); +DECLARE_STATIC_CALL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); +DECLARE_STATIC_CALL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { @@ -1198,16 +1243,16 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); if (hwc->extra_reg.reg) - wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrq(hwc->extra_reg.reg, hwc->extra_reg.config); /* * Add enabled Merge event on next counter * if large increment event being enabled on this counter */ if (is_counter_pair(hwc)) - wrmsrl(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en); + wrmsrq(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en); - wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); + wrmsrq(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); } void x86_pmu_enable_all(int added); @@ -1223,10 +1268,10 @@ static inline void x86_pmu_disable_event(struct perf_event *event) u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, hwc->config & ~disable_mask); + wrmsrq(hwc->config_base, hwc->config & ~disable_mask); if (is_counter_pair(hwc)) - wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0); + wrmsrq(x86_pmu_config_addr(hwc->idx + 1), 0); } void x86_pmu_enable_event(struct perf_event *event); @@ -1394,12 +1439,12 @@ static __always_inline void __amd_pmu_lbr_disable(void) { u64 dbg_ctl, dbg_extn_cfg; - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); - wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + rdmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + wrmsrq(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + rdmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); } } @@ -1531,21 +1576,21 @@ static inline bool intel_pmu_has_bts(struct perf_event *event) static __always_inline void __intel_pmu_pebs_disable_all(void) { - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } static __always_inline void __intel_pmu_arch_lbr_disable(void) { - wrmsrl(MSR_ARCH_LBR_CTL, 0); + wrmsrq(MSR_ARCH_LBR_CTL, 0); } static __always_inline void __intel_pmu_lbr_disable(void) { u64 debugctl; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } int intel_pmu_save_and_restart(struct perf_event *event); @@ -1580,6 +1625,8 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); +void intel_pmu_late_setup(void); + u64 grt_latency_data(struct perf_event *event, u64 status); u64 cmt_latency_data(struct perf_event *event, u64 status); @@ -1636,11 +1683,13 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc); + void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); -void intel_ds_init(void); +void intel_pebs_init(void); void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, struct cpu_hw_events *cpuc, diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1d9e385649b5..70078334e4a3 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -2,23 +2,24 @@ /* * struct hw_perf_event.flags flags */ -PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ -PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ -PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ -PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ -PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ -PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ -PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ -PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ -PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ -PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ -PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ -PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ -PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ -PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ -PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ -PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ -PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ -PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ -PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */ +PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */ +PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x0002000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */ diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index 600bf8d15c0c..bb719d0d3f0b 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -2,6 +2,8 @@ #include <linux/export.h> #include <linux/types.h> #include <linux/bits.h> + +#include <asm/msr.h> #include "probe.h" static umode_t @@ -43,7 +45,7 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ - if (rdmsrl_safe(msr[bit].msr, &val)) + if (rdmsrq_safe(msr[bit].msr, &val)) continue; mask = msr[bit].mask; diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8ddace8cea96..defd86137f12 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -65,6 +65,7 @@ #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "perf_event.h" #include "probe.h" @@ -192,7 +193,7 @@ static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) static inline u64 rapl_read_counter(struct perf_event *event) { u64 raw; - rdmsrl(event->hw.event_base, raw); + rdmsrq(event->hw.event_base, raw); return raw; } @@ -221,7 +222,7 @@ static u64 rapl_event_update(struct perf_event *event) prev_raw_count = local64_read(&hwc->prev_count); do { - rdmsrl(event->hw.event_base, new_raw_count); + rdmsrq(event->hw.event_base, new_raw_count); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); @@ -610,8 +611,8 @@ static int rapl_check_hw_unit(void) u64 msr_rapl_power_unit_bits; int i; - /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + /* protect rdmsrq() to handle virtualization */ + if (rdmsrq_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index dab4ed199227..77fd00b3305e 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -2,6 +2,7 @@ #include <asm/insn.h> #include <linux/mm.h> +#include <asm/msr.h> #include "perf_event.h" static int decode_branch_type(struct insn *insn) diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index 2fd9b0cf9a5e..4bdfcf091200 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -15,6 +15,7 @@ #include <asm/cpufeature.h> #include <asm/hardirq.h> #include <asm/apic.h> +#include <asm/msr.h> #include "../perf_event.h" @@ -254,26 +255,26 @@ static __initconst const u64 zxe_hw_cache_event_ids static void zhaoxin_pmu_disable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0); } static void zhaoxin_pmu_enable_all(int added) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } static inline u64 zhaoxin_pmu_get_status(void) { u64 status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; } static inline void zhaoxin_pmu_ack_status(u64 ack) { - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } static inline void zxc_pmu_ack_status(u64 ack) @@ -293,9 +294,9 @@ static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc) mask = 0xfULL << (idx * 4); - rdmsrl(hwc->config_base, ctrl_val); + rdmsrq(hwc->config_base, ctrl_val); ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); + wrmsrq(hwc->config_base, ctrl_val); } static void zhaoxin_pmu_disable_event(struct perf_event *event) @@ -329,10 +330,10 @@ static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc) bits <<= (idx * 4); mask = 0xfULL << (idx * 4); - rdmsrl(hwc->config_base, ctrl_val); + rdmsrq(hwc->config_base, ctrl_val); ctrl_val &= ~mask; ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); + wrmsrq(hwc->config_base, ctrl_val); } static void zhaoxin_pmu_enable_event(struct perf_event *event) @@ -397,8 +398,7 @@ again: if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); + perf_event_overflow(event, &data, regs); } /* diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 6d91ac5f9836..bfde0a3498b9 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -28,6 +28,7 @@ #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/trace/hyperv.h> @@ -37,7 +38,7 @@ static u64 hv_apic_icr_read(void) { u64 reg_val; - rdmsrl(HV_X64_MSR_ICR, reg_val); + rdmsrq(HV_X64_MSR_ICR, reg_val); return reg_val; } @@ -49,7 +50,7 @@ static void hv_apic_icr_write(u32 low, u32 id) reg_val = reg_val << 32; reg_val |= low; - wrmsrl(HV_X64_MSR_ICR, reg_val); + wrmsrq(HV_X64_MSR_ICR, reg_val); } static u32 hv_apic_read(u32 reg) @@ -75,10 +76,10 @@ static void hv_apic_write(u32 reg, u32 val) { switch (reg) { case APIC_EOI: - wrmsr(HV_X64_MSR_EOI, val, 0); + wrmsrq(HV_X64_MSR_EOI, val); break; case APIC_TASKPRI: - wrmsr(HV_X64_MSR_TPR, val, 0); + wrmsrq(HV_X64_MSR_TPR, val); break; default: native_apic_mem_write(reg, val); @@ -92,7 +93,7 @@ static void hv_apic_eoi_write(void) if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) return; - wrmsr(HV_X64_MSR_EOI, APIC_EOI_ACK, 0); + wrmsrq(HV_X64_MSR_EOI, APIC_EOI_ACK); } static bool cpu_is_self(int cpu) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index ddeb40930bc8..5d27194a2efa 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -21,6 +21,7 @@ #include <asm/hypervisor.h> #include <hyperv/hvhdk.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/idtentry.h> #include <asm/set_memory.h> #include <linux/kexec.h> @@ -62,7 +63,7 @@ static int hyperv_init_ghcb(void) * returned by MSR_AMD64_SEV_ES_GHCB is above shared * memory boundary and map it here. */ - rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); + rdmsrq(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); /* Mask out vTOM bit. ioremap_cache() maps decrypted */ ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; @@ -95,7 +96,7 @@ static int hv_cpu_init(unsigned int cpu) * For root partition we get the hypervisor provided VP assist * page, instead of allocating a new page. */ - rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, PAGE_SIZE, MEMREMAP_WB); } else { @@ -128,7 +129,7 @@ static int hv_cpu_init(unsigned int cpu) } if (!WARN_ON(!(*hvp))) { msr.enable = 1; - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); } return hyperv_init_ghcb(); @@ -140,7 +141,7 @@ static void hv_reenlightenment_notify(struct work_struct *dummy) { struct hv_tsc_emulation_status emu_status; - rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); /* Don't issue the callback if TSC accesses are not emulated */ if (hv_reenlightenment_cb && emu_status.inprogress) @@ -153,11 +154,11 @@ void hyperv_stop_tsc_emulation(void) u64 freq; struct hv_tsc_emulation_status emu_status; - rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); emu_status.inprogress = 0; - wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + wrmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); tsc_khz = div64_u64(freq, 1000); } EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); @@ -203,8 +204,8 @@ void set_hv_tscchange_cb(void (*cb)(void)) re_ctrl.target_vp = hv_vp_index[get_cpu()]; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); - wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrq(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); put_cpu(); } @@ -217,9 +218,9 @@ void clear_hv_tscchange_cb(void) if (!hv_reenlightenment_available()) return; - rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); re_ctrl.enabled = 0; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); hv_reenlightenment_cb = NULL; } @@ -251,16 +252,16 @@ static int hv_cpu_die(unsigned int cpu) */ memunmap(hv_vp_assist_page[cpu]); hv_vp_assist_page[cpu] = NULL; - rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); msr.enable = 0; } - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); } if (hv_reenlightenment_cb == NULL) return 0; - rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); if (re_ctrl.target_vp == hv_vp_index[cpu]) { /* * Reassign reenlightenment notifications to some other online @@ -274,7 +275,7 @@ static int hv_cpu_die(unsigned int cpu) else re_ctrl.enabled = 0; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); } return 0; @@ -331,9 +332,9 @@ static int hv_suspend(void) hv_hypercall_pg = NULL; /* Disable the hypercall page in the hypervisor */ - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 0; - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); ret = hv_cpu_die(0); return ret; @@ -348,11 +349,11 @@ static void hv_resume(void) WARN_ON(ret); /* Re-enable the hypercall page */ - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg_saved); - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hv_hypercall_pg = hv_hypercall_pg_saved; hv_hypercall_pg_saved = NULL; @@ -499,7 +500,7 @@ void __init hyperv_init(void) * in such a VM and is only used in such a VM. */ guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); - wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id); /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); @@ -515,7 +516,7 @@ void __init hyperv_init(void) if (hv_hypercall_pg == NULL) goto clean_guest_os_id; - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; if (hv_root_partition()) { @@ -532,7 +533,7 @@ void __init hyperv_init(void) * so it is populated with code, then copy the code to an * executable page. */ - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); pg = vmalloc_to_page(hv_hypercall_pg); src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, @@ -544,7 +545,7 @@ void __init hyperv_init(void) hv_remap_tsc_clocksource(); } else { hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); } skip_hypercall_pg_init: @@ -608,7 +609,7 @@ skip_hypercall_pg_init: return; clean_guest_os_id: - wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0); hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); free_ghcb_page: @@ -629,7 +630,7 @@ void hyperv_cleanup(void) union hv_reference_tsc_msr tsc_msr; /* Reset our OS id */ - wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0); hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); /* @@ -667,18 +668,18 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) return; panic_reported = true; - rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + rdmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id); - wrmsrl(HV_X64_MSR_CRASH_P0, err); - wrmsrl(HV_X64_MSR_CRASH_P1, guest_id); - wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip); - wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax); - wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp); + wrmsrq(HV_X64_MSR_CRASH_P0, err); + wrmsrq(HV_X64_MSR_CRASH_P1, guest_id); + wrmsrq(HV_X64_MSR_CRASH_P2, regs->ip); + wrmsrq(HV_X64_MSR_CRASH_P3, regs->ax); + wrmsrq(HV_X64_MSR_CRASH_P4, regs->sp); /* * Let Hyper-V know there is crash data available */ - wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); + wrmsrq(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); } EXPORT_SYMBOL_GPL(hyperv_report_panic); @@ -701,7 +702,7 @@ bool hv_is_hyperv_initialized(void) * that the hypercall page is setup */ hypercall_msr.as_uint64 = 0; - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); return hypercall_msr.enable; } diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index 151e851bef09..81b006601370 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -15,6 +15,7 @@ #include <asm/mshyperv.h> #include <asm/paravirt.h> #include <asm/apic.h> +#include <asm/msr.h> static bool hv_pvspin __initdata = true; @@ -39,18 +40,18 @@ static void hv_qlock_wait(u8 *byte, u8 val) * To prevent a race against the unlock path it is required to * disable interrupts before accessing the HV_X64_MSR_GUEST_IDLE * MSR. Otherwise, if the IPI from hv_qlock_kick() arrives between - * the lock value check and the rdmsrl() then the vCPU might be put + * the lock value check and the rdmsrq() then the vCPU might be put * into 'idle' state by the hypervisor and kept in that state for * an unspecified amount of time. */ local_irq_save(flags); /* - * Only issue the rdmsrl() when the lock state has not changed. + * Only issue the rdmsrq() when the lock state has not changed. */ if (READ_ONCE(*byte) == val) { unsigned long msr_val; - rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val); + rdmsrq(HV_X64_MSR_GUEST_IDLE, msr_val); (void)msr_val; } diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 13242ed8ff16..4580936dcb03 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -11,6 +11,7 @@ #include <asm/desc.h> #include <asm/i8259.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/realmode.h> #include <asm/reboot.h> #include <../kernel/smpboot.h> @@ -149,11 +150,11 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored) input->vp_context.rip = rip; input->vp_context.rsp = rsp; input->vp_context.rflags = 0x0000000000000002; - input->vp_context.efer = __rdmsr(MSR_EFER); + input->vp_context.efer = native_rdmsrq(MSR_EFER); input->vp_context.cr0 = native_read_cr0(); input->vp_context.cr3 = __native_read_cr3(); input->vp_context.cr4 = native_read_cr4(); - input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT); + input->vp_context.msr_cr_pat = native_rdmsrq(MSR_IA32_CR_PAT); input->vp_context.idtr.limit = idt_ptr.size; input->vp_context.idtr.base = idt_ptr.address; input->vp_context.gdtr.limit = gdt_ptr.size; diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 77bf05f06b9e..09a165a3c41e 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -22,6 +22,7 @@ #include <asm/realmode.h> #include <asm/e820/api.h> #include <asm/desc.h> +#include <asm/msr.h> #include <uapi/asm/vmx.h> #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -110,12 +111,12 @@ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) static inline u64 rd_ghcb_msr(void) { - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); + return native_rdmsrq(MSR_AMD64_SEV_ES_GHCB); } static inline void wr_ghcb_msr(u64 val) { - native_wrmsrl(MSR_AMD64_SEV_ES_GHCB, val); + native_wrmsrq(MSR_AMD64_SEV_ES_GHCB, val); } static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code, diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h index 1dd14381bcb6..fab11192c60a 100644 --- a/arch/x86/include/asm/acrn.h +++ b/arch/x86/include/asm/acrn.h @@ -25,7 +25,7 @@ void acrn_remove_intr_handler(void); static inline u32 acrn_cpuid_base(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base("ACRNACRNACRN", 0); + return cpuid_base_hypervisor("ACRNACRNACRN", 0); return 0; } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4a37a8bd87fd..15bc07a5ebb3 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -6,6 +6,7 @@ #include <linux/stringify.h> #include <linux/objtool.h> #include <asm/asm.h> +#include <asm/bug.h> #define ALT_FLAGS_SHIFT 16 @@ -82,6 +83,12 @@ struct alt_instr { extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -124,6 +131,37 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, } #endif +#ifdef CONFIG_MITIGATION_ITS +extern void its_init_mod(struct module *mod); +extern void its_fini_mod(struct module *mod); +extern void its_free_mod(struct module *mod); +extern u8 *its_static_thunk(int reg); +#else /* CONFIG_MITIGATION_ITS */ +static inline void its_init_mod(struct module *mod) { } +static inline void its_fini_mod(struct module *mod) { } +static inline void its_free_mod(struct module *mod) { } +static inline u8 *its_static_thunk(int reg) +{ + WARN_ONCE(1, "ITS not compiled in"); + + return NULL; +} +#endif + +#if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL) +extern bool cpu_wants_rethunk(void); +extern bool cpu_wants_rethunk_at(void *addr); +#else +static __always_inline bool cpu_wants_rethunk(void) +{ + return false; +} +static __always_inline bool cpu_wants_rethunk_at(void *addr) +{ + return false; +} +#endif + #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, @@ -335,11 +373,6 @@ void nop_func(void); __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags) .endm -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f -#define new_len3 146f-145f - /* * Same as ALTERNATIVE macro above but for two alternatives. If CPU * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has diff --git a/arch/x86/include/asm/amd/fch.h b/arch/x86/include/asm/amd/fch.h new file mode 100644 index 000000000000..2cf5153edbc2 --- /dev/null +++ b/arch/x86/include/asm/amd/fch.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_AMD_FCH_H_ +#define _ASM_X86_AMD_FCH_H_ + +#define FCH_PM_BASE 0xFED80300 + +/* Register offsets from PM base: */ +#define FCH_PM_DECODEEN 0x00 +#define FCH_PM_DECODEEN_SMBUS0SEL GENMASK(20, 19) +#define FCH_PM_SCRATCH 0x80 +#define FCH_PM_S5_RESET_STATUS 0xC0 + +#endif /* _ASM_X86_AMD_FCH_H_ */ diff --git a/arch/x86/include/asm/amd_hsmp.h b/arch/x86/include/asm/amd/hsmp.h index 03c2ce3edaf5..2137f62853ed 100644 --- a/arch/x86/include/asm/amd_hsmp.h +++ b/arch/x86/include/asm/amd/hsmp.h @@ -1,5 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef _ASM_X86_AMD_HSMP_H_ #define _ASM_X86_AMD_HSMP_H_ @@ -13,4 +12,5 @@ static inline int hsmp_send_message(struct hsmp_message *msg) return -ENODEV; } #endif + #endif /*_ASM_X86_AMD_HSMP_H_*/ diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd/ibs.h index 77f3a589a99a..3ee5903982c2 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd/ibs.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_AMD_IBS_H +#define _ASM_X86_AMD_IBS_H + /* * From PPR Vol 1 for AMD Family 19h Model 01h B1 * 55898 Rev 0.35 - Feb 5, 2021 @@ -151,3 +154,5 @@ struct perf_ibs_data { }; u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; }; + +#endif /* _ASM_X86_AMD_IBS_H */ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd/nb.h index adfa0854cf2d..ddb5108cf46c 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd/nb.h @@ -4,7 +4,7 @@ #include <linux/ioport.h> #include <linux/pci.h> -#include <asm/amd_node.h> +#include <asm/amd/node.h> struct amd_nb_bus_dev_range { u8 bus; diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd/node.h index 23fe617898a8..23fe617898a8 100644 --- a/arch/x86/include/asm/amd_node.h +++ b/arch/x86/include/asm/amd/node.h diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c903d358405d..68e10e30fe9b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -120,7 +120,7 @@ static inline bool apic_is_x2apic_enabled(void) { u64 msr; - if (rdmsrl_safe(MSR_IA32_APICBASE, &msr)) + if (rdmsrq_safe(MSR_IA32_APICBASE, &msr)) return false; return msr & X2APIC_ENABLE; } @@ -209,12 +209,12 @@ static inline void native_apic_msr_write(u32 reg, u32 v) reg == APIC_LVR) return; - wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); + wrmsrq(APIC_BASE_MSR + (reg >> 4), v); } static inline void native_apic_msr_eoi(void) { - __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); + native_wrmsrq(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK); } static inline u32 native_apic_msr_read(u32 reg) @@ -224,20 +224,20 @@ static inline u32 native_apic_msr_read(u32 reg) if (reg == APIC_DFR) return -1; - rdmsrl(APIC_BASE_MSR + (reg >> 4), msr); + rdmsrq(APIC_BASE_MSR + (reg >> 4), msr); return (u32)msr; } static inline void native_x2apic_icr_write(u32 low, u32 id) { - wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); + wrmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } static inline u64 native_x2apic_icr_read(void) { unsigned long val; - rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + rdmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), val); return val; } diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index cbc6157f0b4b..b5982b94bdba 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -16,8 +16,7 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; - asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE - "call __sw_hweight32", + asm_inline (ALTERNATIVE("call __sw_hweight32", "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT) : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT : [val] REG_IN (w)); @@ -46,8 +45,7 @@ static __always_inline unsigned long __arch_hweight64(__u64 w) { unsigned long res; - asm_inline (ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE - "call __sw_hweight64", + asm_inline (ALTERNATIVE("call __sw_hweight64", "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT) : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT : [val] REG_IN (w)); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index cc2881576c2c..f963848024a5 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -114,17 +114,12 @@ #endif #ifndef __ASSEMBLER__ -#ifndef __pic__ static __always_inline __pure void *rip_rel_ptr(void *p) { asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p)); return p; } -#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var))) -#else -#define RIP_REL_REF(var) (var) -#endif #endif /* @@ -243,5 +238,24 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) +/* + * Both i386 and x86_64 returns 64-bit values in edx:eax for certain + * instructions, but GCC's "A" constraint has different meanings. + * For i386, "A" means exactly edx:eax, while for x86_64 it + * means rax *or* rdx. + * + * These helpers wrapping these semantic differences save one instruction + * clearing the high half of 'low': + */ +#ifdef CONFIG_X86_64 +# define EAX_EDX_DECLARE_ARGS(val, low, high) unsigned long low, high +# define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) +# define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) +#else +# define EAX_EDX_DECLARE_ARGS(val, low, high) u64 val +# define EAX_EDX_VAL(val, low, high) (val) +# define EAX_EDX_RET(val, low, high) "=A" (val) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 100413aff640..eebbc8889e70 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -248,7 +248,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) static __always_inline unsigned long variable__ffs(unsigned long word) { - asm("rep; bsf %1,%0" + asm("tzcnt %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; @@ -267,10 +267,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) static __always_inline unsigned long variable_ffz(unsigned long word) { - asm("rep; bsf %1,%0" - : "=r" (word) - : "r" (~word)); - return word; + return variable__ffs(~word); } /** diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 3f02ff6d333d..02b23aa78955 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -74,6 +74,11 @@ # define BOOT_STACK_SIZE 0x1000 #endif +#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) + +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 + #ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; @@ -83,6 +88,11 @@ unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, void (*error)(char *x)); extern struct boot_params *boot_params_ptr; +extern unsigned long *trampoline_32bit; +extern const u16 trampoline_ljmp_imm_offset; + +void trampoline_32bit_src(void *trampoline, bool enable_5lvl); + #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index e7225452963f..e1dbf8df1b69 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -22,7 +22,7 @@ static inline u64 cc_get_mask(void) static inline void cc_set_mask(u64 mask) { - RIP_REL_REF(cc_mask) = mask; + cc_mask = mask; } u64 cc_mkenc(u64 val); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6c2c152d8a67..5b50e0e35129 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -75,7 +75,7 @@ #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ #define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ #define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ -/* Free ( 3*32+ 6) */ +#define X86_FEATURE_ZEN6 ( 3*32+ 6) /* CPU based on Zen6 microarchitecture */ /* Free ( 3*32+ 7) */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ #define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ @@ -476,11 +476,13 @@ #define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ -#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ -#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */ -#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ -#define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32 + 7) /* Workload Classification */ -#define X86_FEATURE_PREFER_YMM (21*32 + 8) /* Avoid ZMM registers due to downclocking */ +#define X86_FEATURE_CLEAR_BHB_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_AMD_FAST_CPPC (21*32+ 5) /* Fast CPPC */ +#define X86_FEATURE_AMD_HTR_CORES (21*32+ 6) /* Heterogeneous Core Topology */ +#define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32+ 7) /* Workload Classification */ +#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ +#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ +#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ /* * BUG word(s) @@ -519,7 +521,7 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */ +/* unused, was #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */ #define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */ @@ -527,10 +529,14 @@ #define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */ /* BUG word 2 */ -#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* "srso" AMD SRSO bug */ -#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */ -#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ -#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ -#define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ -#define X86_BUG_SPECTRE_V2_USER X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ +#define X86_BUG_SRSO X86_BUG( 1*32+ 0) /* "srso" AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG( 1*32+ 1) /* "div0" AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG( 1*32+ 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG( 1*32+ 3) /* "bhi" CPU is affected by Branch History Injection */ +#define X86_BUG_IBPB_NO_RET X86_BUG( 1*32+ 4) /* "ibpb_no_ret" IBPB omits return target predictions */ +#define X86_BUG_SPECTRE_V2_USER X86_BUG( 1*32+ 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ +#define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */ +#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ +#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/cpuid/api.h b/arch/x86/include/asm/cpuid/api.h index 9c180c9cc58e..44fa82e1267c 100644 --- a/arch/x86/include/asm/cpuid/api.h +++ b/arch/x86/include/asm/cpuid/api.h @@ -14,9 +14,9 @@ */ #ifdef CONFIG_X86_32 -bool have_cpuid_p(void); +bool cpuid_feature(void); #else -static inline bool have_cpuid_p(void) +static inline bool cpuid_feature(void) { return true; } @@ -36,9 +36,9 @@ static inline void native_cpuid(u32 *eax, u32 *ebx, } #define NATIVE_CPUID_REG(reg) \ -static inline u32 native_cpuid_##reg(u32 op) \ +static inline u32 native_cpuid_##reg(u32 op) \ { \ - u32 eax = op, ebx, ecx = 0, edx; \ + u32 eax = op, ebx, ecx = 0, edx; \ \ native_cpuid(&eax, &ebx, &ecx, &edx); \ \ @@ -160,6 +160,10 @@ static inline void __cpuid_read_reg(u32 leaf, u32 subleaf, __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ } +/* + * Hypervisor-related APIs: + */ + static __always_inline bool cpuid_function_is_indexed(u32 function) { switch (function) { @@ -184,14 +188,14 @@ static __always_inline bool cpuid_function_is_indexed(u32 function) return false; } -#define for_each_possible_hypervisor_cpuid_base(function) \ +#define for_each_possible_cpuid_base_hypervisor(function) \ for (function = 0x40000000; function < 0x40010000; function += 0x100) -static inline u32 hypervisor_cpuid_base(const char *sig, u32 leaves) +static inline u32 cpuid_base_hypervisor(const char *sig, u32 leaves) { u32 base, eax, signature[3]; - for_each_possible_hypervisor_cpuid_base(base) { + for_each_possible_cpuid_base_hypervisor(base) { cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); /* @@ -207,4 +211,82 @@ static inline u32 hypervisor_cpuid_base(const char *sig, u32 leaves) return 0; } +/* + * CPUID(0x2) parsing: + */ + +/** + * cpuid_leaf_0x2() - Return sanitized CPUID(0x2) register output + * @regs: Output parameter + * + * Query CPUID(0x2) and store its output in @regs. Force set any + * invalid 1-byte descriptor returned by the hardware to zero (the NULL + * cache/TLB descriptor) before returning it to the caller. + * + * Use for_each_cpuid_0x2_desc() to iterate over the register output in + * parsed form. + */ +static inline void cpuid_leaf_0x2(union leaf_0x2_regs *regs) +{ + cpuid_leaf(0x2, regs); + + /* + * All Intel CPUs must report an iteration count of 1. In case + * of bogus hardware, treat all returned descriptors as NULL. + */ + if (regs->desc[0] != 0x01) { + for (int i = 0; i < 4; i++) + regs->regv[i] = 0; + return; + } + + /* + * The most significant bit (MSB) of each register must be clear. + * If a register is invalid, replace its descriptors with NULL. + */ + for (int i = 0; i < 4; i++) { + if (regs->reg[i].invalid) + regs->regv[i] = 0; + } +} + +/** + * for_each_cpuid_0x2_desc() - Iterator for parsed CPUID(0x2) descriptors + * @_regs: CPUID(0x2) register output, as returned by cpuid_leaf_0x2() + * @_ptr: u8 pointer, for macro internal use only + * @_desc: Pointer to the parsed CPUID(0x2) descriptor at each iteration + * + * Loop over the 1-byte descriptors in the passed CPUID(0x2) output registers + * @_regs. Provide the parsed information for each descriptor through @_desc. + * + * To handle cache-specific descriptors, switch on @_desc->c_type. For TLB + * descriptors, switch on @_desc->t_type. + * + * Example usage for cache descriptors:: + * + * const struct leaf_0x2_table *desc; + * union leaf_0x2_regs regs; + * u8 *ptr; + * + * cpuid_leaf_0x2(®s); + * for_each_cpuid_0x2_desc(regs, ptr, desc) { + * switch (desc->c_type) { + * ... + * } + * } + */ +#define for_each_cpuid_0x2_desc(_regs, _ptr, _desc) \ + for (_ptr = &(_regs).desc[1]; \ + _ptr < &(_regs).desc[16] && (_desc = &cpuid_0x2_table[*_ptr]); \ + _ptr++) + +/* + * CPUID(0x80000006) parsing: + */ + +static inline bool cpuid_amd_hygon_has_l3_cache(void) +{ + return cpuid_edx(0x80000006); +} + #endif /* _ASM_X86_CPUID_API_H */ diff --git a/arch/x86/include/asm/cpuid/types.h b/arch/x86/include/asm/cpuid/types.h index 8582e27e836d..8a00364b79de 100644 --- a/arch/x86/include/asm/cpuid/types.h +++ b/arch/x86/include/asm/cpuid/types.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_CPUID_TYPES_H #define _ASM_X86_CPUID_TYPES_H +#include <linux/build_bug.h> #include <linux/types.h> /* @@ -29,4 +30,98 @@ enum cpuid_regs_idx { #define CPUID_LEAF_FREQ 0x16 #define CPUID_LEAF_TILE 0x1d +/* + * Types for CPUID(0x2) parsing: + */ + +struct leaf_0x2_reg { + u32 : 31, + invalid : 1; +}; + +union leaf_0x2_regs { + struct leaf_0x2_reg reg[4]; + u32 regv[4]; + u8 desc[16]; +}; + +/* + * Leaf 0x2 1-byte descriptors' cache types + * To be used for their mappings at cpuid_0x2_table[] + * + * Start at 1 since type 0 is reserved for HW byte descriptors which are + * not recognized by the kernel; i.e., those without an explicit mapping. + */ +enum _cache_table_type { + CACHE_L1_INST = 1, + CACHE_L1_DATA, + CACHE_L2, + CACHE_L3 + /* Adjust __TLB_TABLE_TYPE_BEGIN before adding more types */ +} __packed; +#ifndef __CHECKER__ +static_assert(sizeof(enum _cache_table_type) == 1); +#endif + +/* + * Ensure that leaf 0x2 cache and TLB type values do not intersect, + * since they share the same type field at struct cpuid_0x2_table. + */ +#define __TLB_TABLE_TYPE_BEGIN (CACHE_L3 + 1) + +/* + * Leaf 0x2 1-byte descriptors' TLB types + * To be used for their mappings at cpuid_0x2_table[] + */ +enum _tlb_table_type { + TLB_INST_4K = __TLB_TABLE_TYPE_BEGIN, + TLB_INST_4M, + TLB_INST_2M_4M, + TLB_INST_ALL, + + TLB_DATA_4K, + TLB_DATA_4M, + TLB_DATA_2M_4M, + TLB_DATA_4K_4M, + TLB_DATA_1G, + TLB_DATA_1G_2M_4M, + + TLB_DATA0_4K, + TLB_DATA0_4M, + TLB_DATA0_2M_4M, + + STLB_4K, + STLB_4K_2M, +} __packed; +#ifndef __CHECKER__ +static_assert(sizeof(enum _tlb_table_type) == 1); +#endif + +/* + * Combined parsing table for leaf 0x2 cache and TLB descriptors. + */ + +struct leaf_0x2_table { + union { + enum _cache_table_type c_type; + enum _tlb_table_type t_type; + }; + union { + short c_size; + short entries; + }; +}; + +extern const struct leaf_0x2_table cpuid_0x2_table[256]; + +/* + * All of leaf 0x2's one-byte TLB descriptors implies the same number of entries + * for their respective TLB types. TLB descriptor 0x63 is an exception: it + * implies 4 dTLB entries for 1GB pages and 32 dTLB entries for 2MB or 4MB pages. + * + * Encode that descriptor's dTLB entry count for 2MB/4MB pages here, as the entry + * count for dTLB 1GB pages is already encoded at the cpuid_0x2_table[]'s mapping. + */ +#define TLB_0x63_2M_4M_ENTRIES 32 + #endif /* _ASM_X86_CPUID_TYPES_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index fdbbbfec745a..363110e6b2e3 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -23,7 +23,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7); static __always_inline unsigned long native_get_debugreg(int regno) { - unsigned long val = 0; /* Damn you, gcc! */ + unsigned long val; switch (regno) { case 0: @@ -43,7 +43,7 @@ static __always_inline unsigned long native_get_debugreg(int regno) break; case 7: /* - * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them + * Use "asm volatile" for DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception @@ -55,7 +55,7 @@ static __always_inline unsigned long native_get_debugreg(int regno) * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ - asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%db7, %0" : "=r" (val)); break; default: BUG(); @@ -83,15 +83,15 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) break; case 7: /* - * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them + * Use "asm volatile" for DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the - * __FORCE_ORDER here too to avoid similar problems in the + * "asm volatile" here too to avoid similar problems in the * future. */ - asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); + asm volatile("mov %0, %%db7" ::"r" (value)); break; default: BUG(); @@ -169,7 +169,7 @@ static inline unsigned long get_debugctlmsr(void) if (boot_cpu_data.x86 < 6) return 0; #endif - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); return debugctlmsr; } @@ -180,7 +180,7 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) if (boot_cpu_data.x86 < 6) return; #endif - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 128602612eca..6c8fdc96be7e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -76,12 +76,8 @@ typedef struct user_i387_struct elf_fpregset_t; #include <asm/vdso.h> -#ifdef CONFIG_X86_64 extern unsigned int vdso64_enabled; -#endif -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern unsigned int vdso32_enabled; -#endif /* * This is used to ensure we don't load something for the wrong architecture. diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 77d20555e04d..d535a97c7284 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -53,7 +53,6 @@ static inline void arch_exit_work(unsigned long ti_work) if (unlikely(ti_work & _TIF_IO_BITMAP)) tss_update_io_bitmap(); - fpregs_assert_state_consistent(); if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) switch_fpu_return(); } @@ -61,7 +60,9 @@ static inline void arch_exit_work(unsigned long ti_work) static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) { - if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work)) + fpregs_assert_state_consistent(); + + if (unlikely(ti_work)) arch_exit_work(ti_work); fred_update_rsp0(); diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index f42de5f05e7e..cd6f194a912b 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -126,6 +126,7 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif /* State tracking */ +DECLARE_PER_CPU(bool, kernel_fpu_allowed); DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* Process cleanup */ @@ -136,7 +137,7 @@ static inline void fpstate_free(struct fpu *fpu) { } #endif /* fpstate-related functions which are exported to KVM */ -extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); +extern void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature); extern u64 xstate_get_guest_group_perm(void); diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index c485f1944c5f..c060549c6c94 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -10,7 +10,7 @@ #include <asm/trace/fpu.h> extern void save_fpregs_to_fpstate(struct fpu *fpu); -extern void fpu__drop(struct fpu *fpu); +extern void fpu__drop(struct task_struct *tsk); extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long shstk_addr); extern void fpu_flush_thread(void); @@ -18,31 +18,25 @@ extern void fpu_flush_thread(void); /* * FPU state switching for scheduling. * - * This is a two-stage process: + * switch_fpu() saves the old state and sets TIF_NEED_FPU_LOAD if + * TIF_NEED_FPU_LOAD is not set. This is done within the context + * of the old process. * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. It is required to load the + * Once TIF_NEED_FPU_LOAD is set, it is required to load the * registers before returning to userland or using the content * otherwise. * * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct task_struct *old, int cpu) +static inline void switch_fpu(struct task_struct *old, int cpu) { - if (cpu_feature_enabled(X86_FEATURE_FPU) && + if (!test_tsk_thread_flag(old, TIF_NEED_FPU_LOAD) && + cpu_feature_enabled(X86_FEATURE_FPU) && !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { - struct fpu *old_fpu = &old->thread.fpu; + struct fpu *old_fpu = x86_task_fpu(old); + set_tsk_thread_flag(old, TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -50,7 +44,7 @@ static inline void switch_fpu_prepare(struct task_struct *old, int cpu) * current CPU number in @old_fpu, so the next return * to user space can avoid the FPU register restore * when is returns on the same CPU and still owns the - * context. + * context. See fpregs_restore_userregs(). */ old_fpu->last_cpu = cpu; @@ -58,14 +52,4 @@ static inline void switch_fpu_prepare(struct task_struct *old, int cpu) } } -/* - * Delay loading of the complete FPU state until the return to userland. - * PKRU is handled separately. - */ -static inline void switch_fpu_finish(struct task_struct *new) -{ - if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_tsk_thread_flag(new, TIF_NEED_FPU_LOAD); -} - #endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index de16862bf230..1c94121acd3d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -125,6 +125,7 @@ enum xfeature { XFEATURE_RSRVD_COMP_16, XFEATURE_XTILE_CFG, XFEATURE_XTILE_DATA, + XFEATURE_APX, XFEATURE_MAX, }; @@ -145,6 +146,7 @@ enum xfeature { #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) +#define XFEATURE_MASK_APX (1 << XFEATURE_APX) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ @@ -304,6 +306,13 @@ struct xtile_data { } __packed; /* + * State component 19: 8B extended general purpose register. + */ +struct apx_state { + u64 egpr[16]; +} __packed; + +/* * State component 10 is supervisor state used for context-switching the * PASID state. */ @@ -407,9 +416,11 @@ struct fpu_state_perm { /* * @__state_perm: * - * This bitmap indicates the permission for state components, which - * are available to a thread group. The permission prctl() sets the - * enabled state bits in thread_group_leader()->thread.fpu. + * This bitmap indicates the permission for state components + * available to a thread group, including both user and supervisor + * components and software-defined bits like FPU_GUEST_PERM_LOCKED. + * The permission prctl() sets the enabled state bits in + * thread_group_leader()->thread.fpu. * * All run time operations use the per thread information in the * currently active fpu.fpstate which contains the xfeature masks @@ -525,13 +536,6 @@ struct fpu_guest { u64 xfeatures; /* - * @perm: xfeature bitmap of features which are - * permitted to be enabled for the guest - * vCPU. - */ - u64 perm; - - /* * @xfd_err: Save the guest value. */ u64 xfd_err; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7f39fe7980c5..b308a76afbb7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -32,7 +32,8 @@ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_XTILE) + XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_APX) /* * Features which are restored when returning to user space. diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 2a29e5216881..12b34d5b2953 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -9,6 +9,7 @@ #include <linux/const.h> #include <asm/asm.h> +#include <asm/msr.h> #include <asm/trapnr.h> /* diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 02f239569b93..ab2547f97c2c 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -60,7 +60,7 @@ static inline unsigned long x86_fsbase_read_cpu(void) if (boot_cpu_has(X86_FEATURE_FSGSBASE)) fsbase = rdfsbase(); else - rdmsrl(MSR_FS_BASE, fsbase); + rdmsrq(MSR_FS_BASE, fsbase); return fsbase; } @@ -70,7 +70,7 @@ static inline void x86_fsbase_write_cpu(unsigned long fsbase) if (boot_cpu_has(X86_FEATURE_FSGSBASE)) wrfsbase(fsbase); else - wrmsrl(MSR_FS_BASE, fsbase); + wrmsrq(MSR_FS_BASE, fsbase); } extern unsigned long x86_gsbase_read_cpu_inactive(void); diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 53e4015242b4..97f341777db5 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -82,6 +82,7 @@ #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) #define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9)) #define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10)) +#define INAT_INV64 (1 << (INAT_FLAG_OFFS + 11)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -242,4 +243,9 @@ static inline int inat_evex_scalable(insn_attr_t attr) { return attr & INAT_EVEX_SCALABLE; } + +static inline int inat_is_invalid64(insn_attr_t attr) +{ + return attr & INAT_INV64; +} #endif diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 3a97a7eefb51..be10c188614f 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -126,6 +126,8 @@ #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) +#define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */ + /* "Hybrid" Processors (P-Core/E-Core) */ #define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e889c3bab5a2..ca309a3227c7 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -217,7 +217,7 @@ void memset_io(volatile void __iomem *, int, size_t); static inline void __iowrite32_copy(void __iomem *to, const void *from, size_t count) { - asm volatile("rep ; movsl" + asm volatile("rep movsl" : "=&c"(count), "=&D"(to), "=&S"(from) : "0"(count), "1"(to), "2"(from) : "memory"); @@ -282,7 +282,7 @@ static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ count--; \ } \ } else { \ - asm volatile("rep; outs" #bwl \ + asm volatile("rep outs" #bwl \ : "+S"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ @@ -298,7 +298,7 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ count--; \ } \ } else { \ - asm volatile("rep; ins" #bwl \ + asm volatile("rep ins" #bwl \ : "+D"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5432457d2338..f2ad77929d6e 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -8,6 +8,9 @@ # define PA_PGD 2 # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +#else +/* Size of each exception handler referenced by the IDT */ +# define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */ #endif # define KEXEC_CONTROL_PAGE_SIZE 4096 @@ -59,6 +62,10 @@ struct kimage; extern unsigned long kexec_va_control_page; extern unsigned long kexec_pa_table_page; extern unsigned long kexec_pa_swap_page; +extern gate_desc kexec_debug_idt[]; +extern unsigned char kexec_debug_exc_vectors[]; +extern uint16_t kexec_debug_8250_port; +extern unsigned long kexec_debug_8250_mmio32; #endif /* diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a884ab544335..9c971f846108 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -34,7 +34,9 @@ #include <asm/desc.h> #include <asm/mtrr.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/asm.h> +#include <asm/irq_remapping.h> #include <asm/kvm_page_track.h> #include <asm/kvm_vcpu_regs.h> #include <asm/reboot.h> @@ -1472,8 +1474,13 @@ struct kvm_arch { struct once nx_once; #ifdef CONFIG_X86_64 - /* The number of TDP MMU pages across all roots. */ +#ifdef CONFIG_KVM_PROVE_MMU + /* + * The number of TDP MMU pages across all roots. Used only to sanity + * check that KVM isn't leaking TDP MMU pages. + */ atomic64_t tdp_mmu_pages; +#endif /* * List of struct kvm_mmu_pages being used as roots. @@ -2272,7 +2279,7 @@ static inline unsigned long read_msr(unsigned long msr) { u64 value; - rdmsrl(msr, value); + rdmsrq(msr, value); return value; } #endif @@ -2418,4 +2425,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); */ #define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) +static inline bool kvm_arch_has_irq_bypass(void) +{ + return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); +} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index b51d8a4673f5..9d38ae744a2e 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -141,5 +141,15 @@ #define SYM_FUNC_START_WEAK_NOALIGN(name) \ SYM_START(name, SYM_L_WEAK, SYM_A_NONE) +/* + * Expose 'sym' to the startup code in arch/x86/boot/startup/, by emitting an + * alias prefixed with __pi_ + */ +#ifdef __ASSEMBLER__ +#define SYM_PIC_ALIAS(sym) SYM_ALIAS(__pi_ ## sym, sym, SYM_L_GLOBAL) +#else +#define SYM_PIC_ALIAS(sym) extern typeof(sym) __PASTE(__pi_, sym) __alias(sym) +#endif + #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 1530ee301dfe..ea6494628cb0 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -61,7 +61,7 @@ void __init sev_es_init_vc_handling(void); static inline u64 sme_get_me_mask(void) { - return RIP_REL_REF(sme_me_mask); + return sme_me_mask; } #define __bss_decrypted __section(".bss..decrypted") diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 695e569159c1..8b41f26f003b 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#include <asm/msr.h> + struct cpu_signature { unsigned int sig; unsigned int pf; @@ -17,10 +19,12 @@ struct ucode_cpu_info { void load_ucode_bsp(void); void load_ucode_ap(void); void microcode_bsp_resume(void); +bool __init microcode_loader_disabled(void); #else static inline void load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void microcode_bsp_resume(void) { } +static inline bool __init microcode_loader_disabled(void) { return false; } #endif extern unsigned long initrd_start_early; @@ -61,7 +65,7 @@ static inline u32 intel_get_microcode_revision(void) { u32 rev, dummy; - native_wrmsrl(MSR_IA32_UCODE_REV, 0); + native_wrmsrq(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ native_cpuid_eax(1); diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 8b8055a8eb9e..0fe9c569d171 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -16,6 +16,8 @@ #define MM_CONTEXT_LOCK_LAM 2 /* Allow LAM and SVA coexisting */ #define MM_CONTEXT_FORCE_TAGGED_SVA 3 +/* Tracks mm_cpumask */ +#define MM_CONTEXT_NOTRACK 4 /* * x86 has arch-specific MMU state beyond what lives in mm_struct. @@ -44,9 +46,7 @@ typedef struct { struct ldt_struct *ldt; #endif -#ifdef CONFIG_X86_64 unsigned long flags; -#endif #ifdef CONFIG_ADDRESS_MASKING /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 2398058b6e83..73bf3b1b44e8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -190,7 +190,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev, next) \ do { \ paravirt_enter_mmap(next); \ - switch_mm((prev), (next), NULL); \ + switch_mm_irqs_off((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 @@ -247,6 +247,16 @@ static inline bool is_64bit_mm(struct mm_struct *mm) } #endif +static inline bool is_notrack_mm(struct mm_struct *mm) +{ + return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + +static inline void set_notrack_mm(struct mm_struct *mm) +{ + set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other @@ -272,4 +282,7 @@ unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> +extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm); +extern void unuse_temporary_mm(struct mm_struct *prev_mm); + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index bab5ccfc60a7..778444310cfb 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -8,6 +8,7 @@ #include <linux/io.h> #include <asm/nospec-branch.h> #include <asm/paravirt.h> +#include <asm/msr.h> #include <hyperv/hvhdk.h> /* @@ -304,7 +305,7 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value); static __always_inline u64 hv_raw_get_msr(unsigned int reg) { - return __rdmsr(reg); + return native_rdmsrq(reg); } #else /* CONFIG_HYPERV */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e6134ef2263d..b7dded3c8113 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -211,6 +211,14 @@ * VERW clears CPU Register * File. */ +#define ARCH_CAP_ITS_NO BIT_ULL(62) /* + * Not susceptible to + * Indirect Target Selection. + * This bit is not set by + * HW, but is synthesized by + * VMMs for guests to know + * their affected status. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -525,7 +533,7 @@ #define MSR_HWP_CAPABILITIES 0x00000771 #define MSR_HWP_REQUEST_PKG 0x00000772 #define MSR_HWP_INTERRUPT 0x00000773 -#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_REQUEST 0x00000774 #define MSR_HWP_STATUS 0x00000777 /* CPUID.6.EAX */ @@ -542,16 +550,16 @@ #define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ -#define HWP_MIN_PERF(x) (x & 0xff) -#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) #define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((u64)x & 0xff) << 24) #define HWP_EPP_PERFORMANCE 0x00 #define HWP_EPP_BALANCE_PERFORMANCE 0x80 #define HWP_EPP_BALANCE_POWERSAVE 0xC0 #define HWP_EPP_POWERSAVE 0xFF -#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) +#define HWP_ACTIVITY_WINDOW(x) ((u64)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((u64)(x & 0x1) << 42) /* IA32_HWP_STATUS */ #define HWP_GUARANTEED_CHANGE(x) (x & 0x1) @@ -594,7 +602,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* KeyID partitioning between MKTME and TDX */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 9397a319d165..4096b8af4ba7 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -12,6 +12,7 @@ #include <uapi/asm/msr.h> #include <asm/shared/msr.h> +#include <linux/types.h> #include <linux/percpu.h> struct msr_info { @@ -37,23 +38,6 @@ struct saved_msrs { }; /* - * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" - * constraint has different meanings. For i386, "A" means exactly - * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, - * it means rax *or* rdx. - */ -#ifdef CONFIG_X86_64 -/* Using 64-bit values saves one instruction clearing the high half of low */ -#define DECLARE_ARGS(val, low, high) unsigned long low, high -#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) -#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) -#else -#define DECLARE_ARGS(val, low, high) unsigned long long val -#define EAX_EDX_VAL(val, low, high) (val) -#define EAX_EDX_RET(val, low, high) "=A" (val) -#endif - -/* * Be very careful with includes. This header is prone to include loops. */ #include <asm/atomic.h> @@ -63,13 +47,13 @@ struct saved_msrs { DECLARE_TRACEPOINT(read_msr); DECLARE_TRACEPOINT(write_msr); DECLARE_TRACEPOINT(rdpmc); -extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); -extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); -extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); +extern void do_trace_write_msr(u32 msr, u64 val, int failed); +extern void do_trace_read_msr(u32 msr, u64 val, int failed); +extern void do_trace_rdpmc(u32 msr, u64 val, int failed); #else -static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} -static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} -static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_write_msr(u32 msr, u64 val, int failed) {} +static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {} +static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {} #endif /* @@ -79,9 +63,9 @@ static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} * think of extending them - you will be slapped with a stinking trout or a frozen * shark will reach you, wherever you are! You've been warned. */ -static __always_inline unsigned long long __rdmsr(unsigned int msr) +static __always_inline u64 __rdmsr(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr\n" "2:\n" @@ -91,12 +75,12 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr) return EAX_EDX_VAL(val, low, high); } -static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) +static __always_inline void __wrmsrq(u32 msr, u64 val) { asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) - : : "c" (msr), "a"(low), "d" (high) : "memory"); + : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)) : "memory"); } #define native_rdmsr(msr, val1, val2) \ @@ -106,16 +90,20 @@ do { \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) +static __always_inline u64 native_rdmsrq(u32 msr) +{ + return __rdmsr(msr); +} + #define native_wrmsr(msr, low, high) \ - __wrmsr(msr, low, high) + __wrmsrq((msr), (u64)(high) << 32 | (low)) -#define native_wrmsrl(msr, val) \ - __wrmsr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)) +#define native_wrmsrq(msr, val) \ + __wrmsrq((msr), (val)) -static inline unsigned long long native_read_msr(unsigned int msr) +static inline u64 native_read_msr(u32 msr) { - unsigned long long val; + u64 val; val = __rdmsr(msr); @@ -125,34 +113,35 @@ static inline unsigned long long native_read_msr(unsigned int msr) return val; } -static inline unsigned long long native_read_msr_safe(unsigned int msr, - int *err) +static inline int native_read_msr_safe(u32 msr, u64 *p) { - DECLARE_ARGS(val, low, high); + int err; + EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err]) - : [err] "=r" (*err), EAX_EDX_RET(val, low, high) + : [err] "=r" (err), EAX_EDX_RET(val, low, high) : "c" (msr)); if (tracepoint_enabled(read_msr)) - do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); - return EAX_EDX_VAL(val, low, high); + do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), err); + + *p = EAX_EDX_VAL(val, low, high); + + return err; } /* Can be uninlined because referenced by paravirt */ -static inline void notrace -native_write_msr(unsigned int msr, u32 low, u32 high) +static inline void notrace native_write_msr(u32 msr, u64 val) { - __wrmsr(msr, low, high); + native_wrmsrq(msr, val); if (tracepoint_enabled(write_msr)) - do_trace_write_msr(msr, ((u64)high << 32 | low), 0); + do_trace_write_msr(msr, val, 0); } /* Can be uninlined because referenced by paravirt */ -static inline int notrace -native_write_msr_safe(unsigned int msr, u32 low, u32 high) +static inline int notrace native_write_msr_safe(u32 msr, u64 val) { int err; @@ -160,73 +149,19 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high) "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err]) : [err] "=a" (err) - : "c" (msr), "0" (low), "d" (high) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32)) : "memory"); if (tracepoint_enabled(write_msr)) - do_trace_write_msr(msr, ((u64)high << 32 | low), err); + do_trace_write_msr(msr, val, err); return err; } extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); -/** - * rdtsc() - returns the current TSC without ordering constraints - * - * rdtsc() returns the result of RDTSC as a 64-bit integer. The - * only ordering constraint it supplies is the ordering implied by - * "asm volatile": it will put the RDTSC in the place you expect. The - * CPU can and will speculatively execute that RDTSC, though, so the - * results can be non-monotonic if compared on different CPUs. - */ -static __always_inline unsigned long long rdtsc(void) +static inline u64 native_read_pmc(int counter) { - DECLARE_ARGS(val, low, high); - - asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); - - return EAX_EDX_VAL(val, low, high); -} - -/** - * rdtsc_ordered() - read the current TSC in program order - * - * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. - * It is ordered like a load to a global in-memory counter. It should - * be impossible to observe non-monotonic rdtsc_unordered() behavior - * across multiple CPUs as long as the TSC is synced. - */ -static __always_inline unsigned long long rdtsc_ordered(void) -{ - DECLARE_ARGS(val, low, high); - - /* - * The RDTSC instruction is not ordered relative to memory - * access. The Intel SDM and the AMD APM are both vague on this - * point, but empirically an RDTSC instruction can be - * speculatively executed before prior loads. An RDTSC - * immediately after an appropriate barrier appears to be - * ordered as a normal load, that is, it provides the same - * ordering guarantees as reading from a global memory location - * that some other imaginary CPU is updating continuously with a - * time stamp. - * - * Thus, use the preferred barrier on the respective CPU, aiming for - * RDTSCP as the default. - */ - asm volatile(ALTERNATIVE_2("rdtsc", - "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, - "rdtscp", X86_FEATURE_RDTSCP) - : EAX_EDX_RET(val, low, high) - /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ - :: "ecx"); - - return EAX_EDX_VAL(val, low, high); -} - -static inline unsigned long long native_read_pmc(int counter) -{ - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); if (tracepoint_enabled(rdpmc)) @@ -251,51 +186,44 @@ do { \ (void)((high) = (u32)(__val >> 32)); \ } while (0) -static inline void wrmsr(unsigned int msr, u32 low, u32 high) +static inline void wrmsr(u32 msr, u32 low, u32 high) { - native_write_msr(msr, low, high); + native_write_msr(msr, (u64)high << 32 | low); } -#define rdmsrl(msr, val) \ +#define rdmsrq(msr, val) \ ((val) = native_read_msr((msr))) -static inline void wrmsrl(unsigned int msr, u64 val) +static inline void wrmsrq(u32 msr, u64 val) { - native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); + native_write_msr(msr, val); } /* wrmsr with exception handling */ -static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high) +static inline int wrmsrq_safe(u32 msr, u64 val) { - return native_write_msr_safe(msr, low, high); + return native_write_msr_safe(msr, val); } /* rdmsr with exception handling */ #define rdmsr_safe(msr, low, high) \ ({ \ - int __err; \ - u64 __val = native_read_msr_safe((msr), &__err); \ + u64 __val; \ + int __err = native_read_msr_safe((msr), &__val); \ (*low) = (u32)__val; \ (*high) = (u32)(__val >> 32); \ __err; \ }) -static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p) +static inline int rdmsrq_safe(u32 msr, u64 *p) { - int err; - - *p = native_read_msr_safe(msr, &err); - return err; + return native_read_msr_safe(msr, p); } -#define rdpmc(counter, low, high) \ -do { \ - u64 _l = native_read_pmc((counter)); \ - (low) = (u32)_l; \ - (high) = (u32)(_l >> 32); \ -} while (0) - -#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) +static __always_inline u64 rdpmc(int counter) +{ + return native_read_pmc(counter); +} #endif /* !CONFIG_PARAVIRT_XXL */ @@ -315,11 +243,11 @@ static __always_inline void wrmsrns(u32 msr, u64 val) } /* - * 64-bit version of wrmsr_safe(): + * Dual u32 version of wrmsrq_safe(): */ -static inline int wrmsrl_safe(u32 msr, u64 val) +static inline int wrmsr_safe(u32 msr, u32 low, u32 high) { - return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); + return wrmsrq_safe(msr, (u64)high << 32 | low); } struct msr __percpu *msrs_alloc(void); @@ -330,14 +258,14 @@ int msr_clear_bit(u32 msr, u8 bit); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); -int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); +int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q); void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); -int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); +int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ @@ -351,14 +279,14 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) wrmsr(msr_no, l, h); return 0; } -static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +static inline int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { - rdmsrl(msr_no, *q); + rdmsrq(msr_no, *q); return 0; } -static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +static inline int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { - wrmsrl(msr_no, q); + wrmsrq(msr_no, q); return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, @@ -380,13 +308,13 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } -static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +static inline int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { - return rdmsrl_safe(msr_no, q); + return rdmsrq_safe(msr_no, q); } -static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +static inline int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { - return wrmsrl_safe(msr_no, q); + return wrmsrq_safe(msr_no, q); } static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { @@ -397,5 +325,11 @@ static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ + +/* Compatibility wrappers: */ +#define rdmsrl(msr, val) rdmsrq(msr, val) +#define wrmsrl(msr, val) wrmsrq(msr, val) +#define rdmsrl_on_cpu(cpu, msr, q) rdmsrq_on_cpu(cpu, msr, q) + #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index ce857ef54cf1..dd2b129b0418 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -25,29 +25,31 @@ #define TPAUSE_C01_STATE 1 #define TPAUSE_C02_STATE 0 -static __always_inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) +static __always_inline void __monitor(const void *eax, u32 ecx, u32 edx) { - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); + /* + * Use the instruction mnemonic with implicit operands, as the LLVM + * assembler fails to assemble the mnemonic with explicit operands: + */ + asm volatile("monitor" :: "a" (eax), "c" (ecx), "d" (edx)); } -static __always_inline void __monitorx(const void *eax, unsigned long ecx, - unsigned long edx) +static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx) { - /* "monitorx %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xfa;" + /* "monitorx %eax, %ecx, %edx" */ + asm volatile(".byte 0x0f, 0x01, 0xfa" :: "a" (eax), "c" (ecx), "d"(edx)); } -static __always_inline void __mwait(unsigned long eax, unsigned long ecx) +static __always_inline void __mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); + /* + * Use the instruction mnemonic with implicit operands, as the LLVM + * assembler fails to assemble the mnemonic with explicit operands: + */ + asm volatile("mwait" :: "a" (eax), "c" (ecx)); } /* @@ -76,13 +78,12 @@ static __always_inline void __mwait(unsigned long eax, unsigned long ecx) * EAX (logical) address to monitor * ECX #GP if not zero */ -static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, - unsigned long ecx) +static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx) { /* No MDS buffer clear as this is AMD/HYGON only */ - /* "mwaitx %eax, %ebx, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xfb;" + /* "mwaitx %eax, %ebx, %ecx" */ + asm volatile(".byte 0x0f, 0x01, 0xfb" :: "a" (eax), "b" (ebx), "c" (ecx)); } @@ -95,12 +96,11 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, * executing mwait, it would otherwise go unnoticed and the next tick * would not be reprogrammed accordingly before mwait ever wakes up. */ -static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) +static __always_inline void __sti_mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); + + asm volatile("sti; mwait" :: "a" (eax), "c" (ecx)); } /* @@ -113,16 +113,13 @@ static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +static __always_inline void mwait_idle_with_hints(u32 eax, u32 ecx) { if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) { - if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) { - mb(); - clflush((void *)¤t_thread_info()->flags); - mb(); - } + const void *addr = ¤t_thread_info()->flags; - __monitor((void *)¤t_thread_info()->flags, 0, 0); + alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); + __monitor(addr, 0, 0); if (!need_resched()) { if (ecx & 1) { @@ -144,16 +141,9 @@ static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned lo */ static inline void __tpause(u32 ecx, u32 edx, u32 eax) { - /* "tpause %ecx, %edx, %eax;" */ - #ifdef CONFIG_AS_TPAUSE - asm volatile("tpause %%ecx\n" - : - : "c"(ecx), "d"(edx), "a"(eax)); - #else - asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" - : - : "c"(ecx), "d"(edx), "a"(eax)); - #endif + /* "tpause %ecx" */ + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1" + :: "c" (ecx), "d" (edx), "a" (eax)); } #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index f677382093f3..79d88d12c8fb 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -14,12 +14,26 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -extern int unknown_nmi_panic; - #endif /* CONFIG_X86_LOCAL_APIC */ +extern int unknown_nmi_panic; +extern int panic_on_unrecovered_nmi; +extern int panic_on_io_nmi; + +/* NMI handler flags */ #define NMI_FLAG_FIRST 1 +/** + * enum - NMI types. + * @NMI_LOCAL: Local NMI, CPU-specific NMI generated by the Local APIC. + * @NMI_UNKNOWN: Unknown NMI, the source of the NMI may not be identified. + * @NMI_SERR: System Error NMI, typically triggered by PCI errors. + * @NMI_IO_CHECK: I/O Check NMI, related to I/O errors. + * @NMI_MAX: Maximum value for NMI types. + * + * NMI types are used to categorize NMIs and to dispatch them to the + * appropriate handler. + */ enum { NMI_LOCAL=0, NMI_UNKNOWN, @@ -28,6 +42,7 @@ enum { NMI_MAX }; +/* NMI handler return values */ #define NMI_DONE 0 #define NMI_HANDLED 1 @@ -41,6 +56,25 @@ struct nmiaction { const char *name; }; +/** + * register_nmi_handler - Register a handler for a specific NMI type + * @t: NMI type (e.g. NMI_LOCAL) + * @fn: The NMI handler + * @fg: Flags associated with the NMI handler + * @n: Name of the NMI handler + * @init: Optional __init* attributes for struct nmiaction + * + * Adds the provided handler to the list of handlers for the specified + * NMI type. Handlers flagged with NMI_FLAG_FIRST would be executed first. + * + * Sometimes the source of an NMI can't be reliably determined which + * results in an NMI being tagged as "unknown". Register an additional + * handler using the NMI type - NMI_UNKNOWN to handle such cases. The + * caller would get one last chance to assume responsibility for the + * NMI. + * + * Return: 0 on success, or an error code on failure. + */ #define register_nmi_handler(t, fn, fg, n, init...) \ ({ \ static struct nmiaction init fn##_na = { \ @@ -54,7 +88,16 @@ struct nmiaction { int __register_nmi_handler(unsigned int, struct nmiaction *); -void unregister_nmi_handler(unsigned int, const char *); +/** + * unregister_nmi_handler - Unregister a handler for a specific NMI type + * @type: NMI type (e.g. NMI_LOCAL) + * @name: Name of the NMI handler used during registration + * + * Removes the handler associated with the specified NMI type from the + * NMI handler list. The "name" is used as a lookup key to identify the + * handler. + */ +void unregister_nmi_handler(unsigned int type, const char *name); void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler); diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8a5cc8e70439..20d754b98f3f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -269,7 +269,7 @@ * typically has NO_MELTDOWN). * * While retbleed_untrain_ret() doesn't clobber anything but requires stack, - * entry_ibpb() will clobber AX, CX, DX. + * write_ibpb() will clobber AX, CX, DX. * * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. @@ -279,7 +279,7 @@ VALIDATE_UNRET_END CALL_UNTRAIN_RET ALTERNATIVE_2 "", \ - "call entry_ibpb", \ibpb_feature, \ + "call write_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm @@ -327,7 +327,7 @@ .endm .macro CLEAR_BRANCH_HISTORY_VMEXIT - ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_VMEXIT .endm #else #define CLEAR_BRANCH_HISTORY @@ -336,10 +336,14 @@ #else /* __ASSEMBLER__ */ +#define ITS_THUNK_SIZE 64 + typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; +typedef u8 its_thunk_t[ITS_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; +extern its_thunk_t __x86_indirect_its_thunk_array[]; #ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); @@ -363,12 +367,18 @@ static inline void srso_return_thunk(void) {} static inline void srso_alias_return_thunk(void) {} #endif +#ifdef CONFIG_MITIGATION_ITS +extern void its_return_thunk(void); +#else +static inline void its_return_thunk(void) {} +#endif + extern void retbleed_return_thunk(void); extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); -extern void entry_ibpb(void); +extern void write_ibpb(void); #ifdef CONFIG_X86_64 extern void clear_bhb_loop(void); @@ -514,11 +524,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } -extern u64 x86_pred_cmd; - static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_IBPB); + asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB) + : ASM_CALL_CONSTRAINT + :: "rax", "rcx", "rdx", "memory"); } /* The Intel SPEC CTRL MSR base value cache */ @@ -561,7 +571,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); extern u16 mds_verw_sel; diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index a9b62e0e6f79..623f1e9f493e 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -73,7 +73,6 @@ extern unsigned int __VMALLOC_RESERVE; extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); -extern void setup_bootmem_allocator(void); #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index d3aab6f4e59a..015d23f3e01f 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -62,7 +62,6 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); KCFI_REFERENCE(copy_page); -#ifdef CONFIG_X86_5LEVEL /* * User space process size. This is the first address outside the user range. * There are a few constraints that determine this: @@ -93,7 +92,6 @@ static __always_inline unsigned long task_size_max(void) return ret; } -#endif /* CONFIG_X86_5LEVEL */ #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 1faa8f88850a..7400dab373fe 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -41,25 +41,14 @@ #define __PAGE_OFFSET_BASE_L5 _AC(0xff11000000000000, UL) #define __PAGE_OFFSET_BASE_L4 _AC(0xffff888000000000, UL) -#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT #define __PAGE_OFFSET page_offset_base -#else -#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4 -#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 52 - -#ifdef CONFIG_X86_5LEVEL #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47) -/* See task_size_max() in <asm/page_64.h> */ -#else -#define __VIRTUAL_MASK_SHIFT 47 -#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) -#endif #define TASK_SIZE_MAX task_size_max() #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 9f77bf03d747..018a8d906ca3 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -29,9 +29,7 @@ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC /* Physical address where kernel should be loaded. */ -#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ - + (CONFIG_PHYSICAL_ALIGN - 1)) \ - & ~(CONFIG_PHYSICAL_ALIGN - 1)) +#define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1) #define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c4c23190925c..b5e59a7ba0d0 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -175,26 +175,24 @@ static inline void __write_cr4(unsigned long x) PVOP_VCALL1(cpu.write_cr4, x); } -static inline u64 paravirt_read_msr(unsigned msr) +static inline u64 paravirt_read_msr(u32 msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } -static inline void paravirt_write_msr(unsigned msr, - unsigned low, unsigned high) +static inline void paravirt_write_msr(u32 msr, u64 val) { - PVOP_VCALL3(cpu.write_msr, msr, low, high); + PVOP_VCALL2(cpu.write_msr, msr, val); } -static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) +static inline int paravirt_read_msr_safe(u32 msr, u64 *val) { - return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); + return PVOP_CALL2(int, cpu.read_msr_safe, msr, val); } -static inline int paravirt_write_msr_safe(unsigned msr, - unsigned low, unsigned high) +static inline int paravirt_write_msr_safe(u32 msr, u64 val) { - return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); + return PVOP_CALL2(int, cpu.write_msr_safe, msr, val); } #define rdmsr(msr, val1, val2) \ @@ -204,55 +202,46 @@ do { \ val2 = _l >> 32; \ } while (0) -#define wrmsr(msr, val1, val2) \ -do { \ - paravirt_write_msr(msr, val1, val2); \ -} while (0) +static __always_inline void wrmsr(u32 msr, u32 low, u32 high) +{ + paravirt_write_msr(msr, (u64)high << 32 | low); +} -#define rdmsrl(msr, val) \ +#define rdmsrq(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) -static inline void wrmsrl(unsigned msr, u64 val) +static inline void wrmsrq(u32 msr, u64 val) { - wrmsr(msr, (u32)val, (u32)(val>>32)); + paravirt_write_msr(msr, val); } -#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) +static inline int wrmsrq_safe(u32 msr, u64 val) +{ + return paravirt_write_msr_safe(msr, val); +} /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ - int _err; \ - u64 _l = paravirt_read_msr_safe(msr, &_err); \ + u64 _l; \ + int _err = paravirt_read_msr_safe((msr), &_l); \ (*a) = (u32)_l; \ - (*b) = _l >> 32; \ + (*b) = (u32)(_l >> 32); \ _err; \ }) -static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) +static __always_inline int rdmsrq_safe(u32 msr, u64 *p) { - int err; - - *p = paravirt_read_msr_safe(msr, &err); - return err; + return paravirt_read_msr_safe(msr, p); } -static inline unsigned long long paravirt_read_pmc(int counter) +static __always_inline u64 rdpmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } -#define rdpmc(counter, low, high) \ -do { \ - u64 _l = paravirt_read_pmc(counter); \ - low = (u32)_l; \ - high = _l >> 32; \ -} while (0) - -#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) - static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); @@ -474,8 +463,6 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) PVOP_VCALL2(mmu.set_p4d, p4dp, val); } -#if CONFIG_PGTABLE_LEVELS >= 5 - static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, @@ -507,8 +494,6 @@ static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) set_pgd(pgdp, native_make_pgd(0)); \ } while (0) -#endif /* CONFIG_PGTABLE_LEVELS == 5 */ - static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 631c306ce1ff..37a8627d8277 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -91,15 +91,15 @@ struct pv_cpu_ops { unsigned int *ecx, unsigned int *edx); /* Unsafe MSR operations. These will warn or panic on failure. */ - u64 (*read_msr)(unsigned int msr); - void (*write_msr)(unsigned int msr, unsigned low, unsigned high); + u64 (*read_msr)(u32 msr); + void (*write_msr)(u32 msr, u64 val); /* * Safe MSR operations. - * read sets err to 0 or -EIO. write returns 0 or -EIO. + * Returns 0 or -EIO. */ - u64 (*read_msr_safe)(unsigned int msr, int *err); - int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high); + int (*read_msr_safe)(u32 msr, u64 *val); + int (*write_msr_safe)(u32 msr, u64 val); u64 (*read_pmc)(int counter); @@ -189,12 +189,10 @@ struct pv_mmu_ops { void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval); -#if CONFIG_PGTABLE_LEVELS >= 5 struct paravirt_callee_save p4d_val; struct paravirt_callee_save make_p4d; void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval); -#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ struct pv_lazy_ops lazy_mode; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 5fe314a2e73e..b0d03b6c279b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -29,6 +29,8 @@ #ifdef CONFIG_SMP +#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" + #ifdef CONFIG_CC_HAS_NAMED_AS #ifdef __CHECKER__ @@ -36,23 +38,23 @@ # define __seg_fs __attribute__((address_space(__seg_fs))) #endif +#define __percpu_prefix #define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg) -#define __percpu_prefix "" #else /* !CONFIG_CC_HAS_NAMED_AS: */ +#define __percpu_prefix __force_percpu_prefix #define __percpu_seg_override -#define __percpu_prefix "%%"__stringify(__percpu_seg)":" #endif /* CONFIG_CC_HAS_NAMED_AS */ -#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" -#define __my_cpu_offset this_cpu_read(this_cpu_off) - /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. - * + */ +#define __my_cpu_offset this_cpu_read(this_cpu_off) + +/* * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit * kernel, because games are played with CONFIG_X86_64 there and * sizeof(this_cpu_off) becames 4. @@ -77,9 +79,9 @@ #else /* !CONFIG_SMP: */ +#define __force_percpu_prefix +#define __percpu_prefix #define __percpu_seg_override -#define __percpu_prefix "" -#define __force_percpu_prefix "" #define PER_CPU_VAR(var) (var)__percpu_rel @@ -97,8 +99,8 @@ # define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) #endif -#define __percpu_arg(x) __percpu_prefix "%" #x #define __force_percpu_arg(x) __force_percpu_prefix "%" #x +#define __percpu_arg(x) __percpu_prefix "%" #x /* * For arch-specific code, we can use direct single-insn ops (they diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 812dac3f79f0..70d1d94aca7e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -195,6 +195,7 @@ union cpuid10_edx { */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +#define ARCH_PERFMON_ACR_LEAF 0x2 union cpuid35_eax { struct { diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index a33147520044..c88691b15f3c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -6,6 +6,8 @@ #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> +#include <asm/cpufeature.h> + #define __HAVE_ARCH_PTE_ALLOC_ONE #define __HAVE_ARCH_PGD_FREE #include <asm-generic/pgalloc.h> @@ -29,16 +31,17 @@ static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * Instead of one PGD, we acquire two PGDs. Being order-1, it is - * both 8k in size and 8k-aligned. That lets us just flip bit 12 - * in a pointer to swap between the two 4k halves. + * In case of Page Table Isolation active, we acquire two PGDs instead of one. + * Being order-1, it is both 8k in size and 8k-aligned. That lets us just + * flip bit 12 in a pointer to swap between the two 4k halves. */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif +static inline unsigned int pgd_allocation_order(void) +{ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + return 1; + return 0; +} /* * Allocate and free page tables. diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 66425424ce91..54690bd4ddbe 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -18,8 +18,6 @@ typedef union { } pte_t; #endif /* !__ASSEMBLER__ */ -#define SHARED_KERNEL_PMD 0 - #define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 9d5b257d44e3..580b09bf6a45 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -27,9 +27,7 @@ typedef union { } pmd_t; #endif /* !__ASSEMBLER__ */ -#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) - -#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 0 : PGTBL_PMD_MODIFIED) +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* * PGDIR_SHIFT determines what a top-level page table entry can map diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7bd6bd6df4a1..5ddba366d3b4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -292,13 +292,6 @@ static inline unsigned long pgd_pfn(pgd_t pgd) return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } -#define p4d_leaf p4d_leaf -static inline bool p4d_leaf(p4d_t p4d) -{ - /* No 512 GiB pages yet */ - return 0; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define pmd_leaf pmd_leaf @@ -1472,9 +1465,6 @@ static inline bool pgdp_maps_userspace(void *__ptr) return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } -#define pgd_leaf pgd_leaf -static inline bool pgd_leaf(pgd_t pgd) { return false; } - #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index b89f8f1194a9..f06e5d6a2747 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -41,11 +41,9 @@ static inline void sync_initial_page_table(void) { } pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) -#if CONFIG_PGTABLE_LEVELS >= 5 #define p4d_ERROR(e) \ pr_err("%s:%d: bad p4d %p(%016lx)\n", \ __FILE__, __LINE__, &(e), p4d_val(e)) -#endif #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %p(%016lx)\n", \ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 5bb782d856f2..4604f924d8b8 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -23,7 +23,6 @@ typedef struct { pmdval_t pmd; } pmd_t; extern unsigned int __pgtable_l5_enabled; -#ifdef CONFIG_X86_5LEVEL #ifdef USE_EARLY_PGTABLE_L5 /* * cpu_feature_enabled() is not available in early boot code. @@ -37,19 +36,11 @@ static inline bool pgtable_l5_enabled(void) #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57) #endif /* USE_EARLY_PGTABLE_L5 */ -#else -#define pgtable_l5_enabled() 0 -#endif /* CONFIG_X86_5LEVEL */ - extern unsigned int pgdir_shift; extern unsigned int ptrs_per_p4d; #endif /* !__ASSEMBLER__ */ -#define SHARED_KERNEL_PMD 0 - -#ifdef CONFIG_X86_5LEVEL - /* * PGDIR_SHIFT determines what a top-level page table entry can map */ @@ -67,17 +58,6 @@ extern unsigned int ptrs_per_p4d; #define MAX_POSSIBLE_PHYSMEM_BITS 52 -#else /* CONFIG_X86_5LEVEL */ - -/* - * PGDIR_SHIFT determines what a top-level page table entry can map - */ -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 -#define MAX_PTRS_PER_P4D 1 - -#endif /* CONFIG_X86_5LEVEL */ - /* * 3rd level page */ @@ -130,15 +110,9 @@ extern unsigned int ptrs_per_p4d; #define __VMEMMAP_BASE_L4 0xffffea0000000000UL #define __VMEMMAP_BASE_L5 0xffd4000000000000UL -#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base # define VMALLOC_SIZE_TB (pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4) # define VMEMMAP_START vmemmap_base -#else -# define VMALLOC_START __VMALLOC_BASE_L4 -# define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4 -# define VMEMMAP_START __VMEMMAP_BASE_L4 -#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #ifdef CONFIG_RANDOMIZE_MEMORY # define DIRECT_MAP_PHYSMEM_END direct_map_physmem_end diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5d2f7e5aff26..bde58f6510ac 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -16,7 +16,7 @@ struct vm86; #include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeatures.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> @@ -514,15 +514,14 @@ struct thread_struct { struct thread_shstk shstk; #endif - - /* Floating point and extended processor state */ - struct fpu fpu; - /* - * WARNING: 'fpu' is dynamically-sized. It *MUST* be at - * the end. - */ }; +#ifdef CONFIG_X86_DEBUG_FPU +extern struct fpu *x86_task_fpu(struct task_struct *task); +#else +# define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task)))) +#endif + extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); static inline void arch_thread_struct_whitelist(unsigned long *offset, @@ -734,6 +733,7 @@ void store_cpu_caps(struct cpuinfo_x86 *info); enum l1tf_mitigations { L1TF_MITIGATION_OFF, + L1TF_MITIGATION_AUTO, L1TF_MITIGATION_FLUSH_NOWARN, L1TF_MITIGATION_FLUSH, L1TF_MITIGATION_FLUSH_NOSMT, diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 011bf67a1866..bd6afe805cf6 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -9,6 +9,8 @@ #include <linux/resctrl_types.h> #include <linux/sched.h> +#include <asm/msr.h> + /* * This value can never be a valid CLOSID, and is used when mapping a * (closid, rmid) pair to an index and back. On x86 only the RMID is diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ad9212df0ec0..6324f4c6c545 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -52,6 +52,7 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); +extern void startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index acb85b9346d8..0020d77a0800 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -116,7 +116,7 @@ enum psc_op { #define GHCB_MSR_VMPL_REQ 0x016 #define GHCB_MSR_VMPL_REQ_LEVEL(v) \ /* GHCBData[39:32] */ \ - (((u64)(v) & GENMASK_ULL(7, 0) << 32) | \ + ((((u64)(v) & GENMASK_ULL(7, 0)) << 32) | \ /* GHCBDdata[11:0] */ \ GHCB_MSR_VMPL_REQ) diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h new file mode 100644 index 000000000000..3dfd306d1c9e --- /dev/null +++ b/arch/x86/include/asm/sev-internal.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define DR7_RESET_VALUE 0x400 + +extern struct ghcb boot_ghcb_page; +extern u64 sev_hv_features; +extern u64 sev_secrets_pa; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. + */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +extern struct svsm_ca boot_svsm_ca_page; + +struct ghcb *__sev_get_ghcb(struct ghcb_state *state); +void __sev_put_ghcb(struct ghcb_state *state); + +DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op); + +DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); +DECLARE_PER_CPU(u64, svsm_caa_pa); + +extern struct svsm_ca *boot_svsm_caa; +extern u64 boot_svsm_caa_pa; + +static __always_inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return boot_svsm_caa; +} + +static __always_inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + +int svsm_perform_call_protocol(struct svsm_call *call); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return native_rdmsrq(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +void snp_register_ghcb_early(unsigned long paddr); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); +u64 get_hv_features(void); + +const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ba7999f66abe..6158893786d6 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -15,6 +15,7 @@ #include <asm/sev-common.h> #include <asm/coco.h> #include <asm/set_memory.h> +#include <asm/svm.h> #define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 2ULL @@ -83,6 +84,36 @@ extern void vc_no_ghcb(void); extern void vc_boot_ghcb(void); extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + /* PVALIDATE return codes */ #define PVALIDATE_FAIL_SIZEMISMATCH 6 @@ -484,6 +515,34 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + ghcb->save.sw_exit_code = 0; + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +void vc_forward_exception(struct es_em_ctxt *ctxt); + +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); + +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2); + +extern struct ghcb *boot_ghcb; + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h index a341c878e977..b8027b63cd7a 100644 --- a/arch/x86/include/asm/simd.h +++ b/arch/x86/include/asm/simd.h @@ -1,6 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_SIMD_H +#define _ASM_SIMD_H #include <asm/fpu/api.h> +#include <linux/compiler_attributes.h> +#include <linux/types.h> /* * may_use_simd - whether it is allowable at this time to issue SIMD @@ -10,3 +14,5 @@ static __must_check inline bool may_use_simd(void) { return irq_fpu_usable(); } + +#endif /* _ASM_SIMD_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 55a5e656e4b9..4f84d421d1cf 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -16,23 +16,23 @@ #ifdef __ASSEMBLER__ #define ASM_CLAC \ - ALTERNATIVE __stringify(ANNOTATE_IGNORE_ALTERNATIVE), "clac", X86_FEATURE_SMAP + ALTERNATIVE "", "clac", X86_FEATURE_SMAP #define ASM_STAC \ - ALTERNATIVE __stringify(ANNOTATE_IGNORE_ALTERNATIVE), "stac", X86_FEATURE_SMAP + ALTERNATIVE "", "stac", X86_FEATURE_SMAP #else /* __ASSEMBLER__ */ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ANNOTATE_IGNORE_ALTERNATIVE "", "clac", X86_FEATURE_SMAP); + alternative("", "clac", X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ANNOTATE_IGNORE_ALTERNATIVE "", "stac", X86_FEATURE_SMAP); + alternative("", "stac", X86_FEATURE_SMAP); } static __always_inline unsigned long smap_save(void) @@ -59,9 +59,9 @@ static __always_inline void smap_restore(unsigned long flags) /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "", "clac", X86_FEATURE_SMAP) + ALTERNATIVE("", "clac", X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "", "stac", X86_FEATURE_SMAP) + ALTERNATIVE("", "stac", X86_FEATURE_SMAP) #define ASM_CLAC_UNSAFE \ ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "clac", X86_FEATURE_SMAP) diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index 658b690b2ccb..00b7e0398210 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -84,7 +84,7 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) static __always_inline void __update_spec_ctrl(u64 val) { __this_cpu_write(x86_spec_ctrl_current, val); - native_wrmsrl(MSR_IA32_SPEC_CTRL, val); + native_wrmsrq(MSR_IA32_SPEC_CTRL, val); } #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6266d6b9e0b8..ecda17efa042 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -10,30 +10,19 @@ #include <linux/irqflags.h> #include <linux/jump_label.h> -/* - * The compiler should not reorder volatile asm statements with respect to each - * other: they should execute in program order. However GCC 4.9.x and 5.x have - * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder - * volatile asm. The write functions are not affected since they have memory - * clobbers preventing reordering. To prevent reads from being reordered with - * respect to writes, use a dummy memory operand. - */ - -#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) - void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr0,%0" : "=r" (val)); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr2,%0" : "=r" (val)); return val; } @@ -45,7 +34,7 @@ static __always_inline void native_write_cr2(unsigned long val) static __always_inline unsigned long __native_read_cr3(void) { unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr3,%0" : "=r" (val)); return val; } @@ -66,10 +55,10 @@ static inline unsigned long native_read_cr4(void) asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) - : "=r" (val) : "0" (0), __FORCE_ORDER); + : "=r" (val) : "0" (0)); #else /* CR4 always exists on x86_64. */ - asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr4,%0" : "=r" (val)); #endif return val; } diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 32c0d981a82a..e9cce169bb4c 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -33,11 +33,11 @@ extern size_t strlen(const char *s); static __always_inline void *__memcpy(void *to, const void *from, size_t n) { int d0, d1, d2; - asm volatile("rep ; movsl\n\t" + asm volatile("rep movsl\n\t" "movl %4,%%ecx\n\t" "andl $3,%%ecx\n\t" "jz 1f\n\t" - "rep ; movsb\n\t" + "rep movsb\n\t" "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from) @@ -89,7 +89,7 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, if (n >= 5 * 4) { /* large block: use rep prefix */ int ecx; - asm volatile("rep ; movsl" + asm volatile("rep movsl" : "=&c" (ecx), "=&D" (edi), "=&S" (esi) : "0" (n / 4), "1" (edi), "2" (esi) : "memory" @@ -165,8 +165,7 @@ extern void *memchr(const void *cs, int c, size_t count); static inline void *__memset_generic(void *s, char c, size_t count) { int d0, d1; - asm volatile("rep\n\t" - "stosb" + asm volatile("rep stosb" : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (count) : "memory"); @@ -199,8 +198,7 @@ extern void *memset(void *, int, size_t); static inline void *memset16(uint16_t *s, uint16_t v, size_t n) { int d0, d1; - asm volatile("rep\n\t" - "stosw" + asm volatile("rep stosw" : "=&c" (d0), "=&D" (d1) : "a" (v), "1" (s), "0" (n) : "memory"); @@ -211,8 +209,7 @@ static inline void *memset16(uint16_t *s, uint16_t v, size_t n) static inline void *memset32(uint32_t *s, uint32_t v, size_t n) { int d0, d1; - asm volatile("rep\n\t" - "stosl" + asm volatile("rep stosl" : "=&c" (d0), "=&D" (d1) : "a" (v), "1" (s), "0" (n) : "memory"); diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index d8416b3bf832..e8e5aab06255 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -9,6 +9,7 @@ #include <asm/desc.h> #include <asm/fpu/api.h> +#include <asm/msr.h> /* image of the saved processor state */ struct saved_context { diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 54df06687d83..b512f9665f78 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -9,6 +9,7 @@ #include <asm/desc.h> #include <asm/fpu/api.h> +#include <asm/msr.h> /* * Image of the saved processor state, used by the low level ACPI suspend to diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 75248546403d..499b1c15cc8b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -52,6 +52,8 @@ do { \ } while (0) #ifdef CONFIG_X86_32 +#include <asm/msr.h> + static inline void refresh_sysenter_cs(struct thread_struct *thread) { /* Only happens when SEP is enabled, no need to test "SEP"arately: */ @@ -59,7 +61,7 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) return; this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + wrmsrq(MSR_IA32_SYSENTER_CS, thread->sysenter_cs); } #endif diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index ab9e143ec9fe..5337f1be18f6 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -11,11 +11,11 @@ * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ -#define POKE_MAX_OPCODE_SIZE 5 +#define TEXT_POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); -extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); +extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); /* * Clear and restore the kernel write-protection flag on the local CPU. @@ -32,17 +32,17 @@ extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void text_poke_sync(void); +extern void smp_text_poke_sync_each_cpu(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); #define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); +extern int smp_text_poke_int3_handler(struct pt_regs *regs); +extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_finish(void); +extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate); +extern void smp_text_poke_batch_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC @@ -82,7 +82,7 @@ static __always_inline int text_opcode_size(u8 opcode) } union text_poke_insn { - u8 text[POKE_MAX_OPCODE_SIZE]; + u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; @@ -128,8 +128,8 @@ void *text_gen_insn(u8 opcode, const void *addr, const void *dest) } extern int after_bootmem; -extern __ro_after_init struct mm_struct *poking_mm; -extern __ro_after_init unsigned long poking_addr; +extern __ro_after_init struct mm_struct *text_poke_mm; +extern __ro_after_init unsigned long text_poke_mm_addr; #ifndef CONFIG_UML_X86 static __always_inline @@ -142,13 +142,14 @@ static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* - * The int3 handler in entry_64.S adds a gap between the + * The INT3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of - * this gap. See the idtentry macro's create_gap option. + * this gap. See the idtentry macro's X86_TRAP_BP logic. * - * Similarly entry_32.S will have a gap on the stack for (any) hardware - * exception and pt_regs; see FIXUP_FRAME. + * Similarly, entry_32.S will have a gap on the stack for + * (any) hardware exception and pt_regs; see the + * FIXUP_FRAME macro. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h deleted file mode 100644 index f0f9bcdb74d9..000000000000 --- a/arch/x86/include/asm/trace/common.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_TRACE_COMMON_H -#define _ASM_TRACE_COMMON_H - -#ifdef CONFIG_TRACING -DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); -#define trace_pagefault_enabled() \ - static_branch_unlikely(&trace_pagefault_key) -#else -static inline bool trace_pagefault_enabled(void) { return false; } -#endif - -#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h deleted file mode 100644 index 6b1e87194809..000000000000 --- a/arch/x86/include/asm/trace/exceptions.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM exceptions - -#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PAGE_FAULT_H - -#include <linux/tracepoint.h> -#include <asm/trace/common.h> - -extern int trace_pagefault_reg(void); -extern void trace_pagefault_unreg(void); - -DECLARE_EVENT_CLASS(x86_exceptions, - - TP_PROTO(unsigned long address, struct pt_regs *regs, - unsigned long error_code), - - TP_ARGS(address, regs, error_code), - - TP_STRUCT__entry( - __field( unsigned long, address ) - __field( unsigned long, ip ) - __field( unsigned long, error_code ) - ), - - TP_fast_assign( - __entry->address = address; - __entry->ip = regs->ip; - __entry->error_code = error_code; - ), - - TP_printk("address=%ps ip=%ps error_code=0x%lx", - (void *)__entry->address, (void *)__entry->ip, - __entry->error_code) ); - -#define DEFINE_PAGE_FAULT_EVENT(name) \ -DEFINE_EVENT_FN(x86_exceptions, name, \ - TP_PROTO(unsigned long address, struct pt_regs *regs, \ - unsigned long error_code), \ - TP_ARGS(address, regs, error_code), \ - trace_pagefault_reg, trace_pagefault_unreg); - -DEFINE_PAGE_FAULT_EVENT(page_fault_user); -DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); - -#undef TRACE_INCLUDE_PATH -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE exceptions -#endif /* _TRACE_PAGE_FAULT_H */ - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 4645a6334063..0454d5e60e5d 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -74,11 +74,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 88e7f0f3bf62..7408bebdfde0 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -6,7 +6,6 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> -#include <asm/trace/common.h> #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 94408a784c8e..4f7f09f50552 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -5,10 +5,65 @@ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H +#include <asm/asm.h> #include <asm/cpufeature.h> #include <asm/processor.h> #include <asm/msr.h> +/** + * rdtsc() - returns the current TSC without ordering constraints + * + * rdtsc() returns the result of RDTSC as a 64-bit integer. The + * only ordering constraint it supplies is the ordering implied by + * "asm volatile": it will put the RDTSC in the place you expect. The + * CPU can and will speculatively execute that RDTSC, though, so the + * results can be non-monotonic if compared on different CPUs. + */ +static __always_inline u64 rdtsc(void) +{ + EAX_EDX_DECLARE_ARGS(val, low, high); + + asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); + + return EAX_EDX_VAL(val, low, high); +} + +/** + * rdtsc_ordered() - read the current TSC in program order + * + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. + * It is ordered like a load to a global in-memory counter. It should + * be impossible to observe non-monotonic rdtsc_unordered() behavior + * across multiple CPUs as long as the TSC is synced. + */ +static __always_inline u64 rdtsc_ordered(void) +{ + EAX_EDX_DECLARE_ARGS(val, low, high); + + /* + * The RDTSC instruction is not ordered relative to memory + * access. The Intel SDM and the AMD APM are both vague on this + * point, but empirically an RDTSC instruction can be + * speculatively executed before prior loads. An RDTSC + * immediately after an appropriate barrier appears to be + * ordered as a normal load, that is, it provides the same + * ordering guarantees as reading from a global memory location + * that some other imaginary CPU is updating continuously with a + * time stamp. + * + * Thus, use the preferred barrier on the respective CPU, aiming for + * RDTSCP as the default. + */ + asm volatile(ALTERNATIVE_2("rdtsc", + "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, + "rdtscp", X86_FEATURE_RDTSCP) + : EAX_EDX_RET(val, low, high) + /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ + :: "ecx"); + + return EAX_EDX_VAL(val, low, high); +} + /* * Standard way to access the cycle counter. */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c52f0133425b..c8a5ae35c871 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -26,8 +26,8 @@ extern unsigned long USER_PTR_MAX; */ static inline unsigned long __untagged_addr(unsigned long addr) { - asm (ALTERNATIVE("", - "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM) + asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]", + X86_FEATURE_LAM) : [addr] "+r" (addr) : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); @@ -54,7 +54,7 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, #endif #define valid_user_address(x) \ - ((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX)) + likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX)) /* * Masking the user address is an alternative to a conditional diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 80be0da733df..b7253ef3205a 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -27,17 +27,9 @@ struct vdso_image { long sym_vdso32_rt_sigreturn_landing_pad; }; -#ifdef CONFIG_X86_64 extern const struct vdso_image vdso_image_64; -#endif - -#ifdef CONFIG_X86_X32_ABI extern const struct vdso_image vdso_image_x32; -#endif - -#if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const struct vdso_image vdso_image_32; -#endif extern int __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h index c9b2ba7a9ec4..7000aeb59aa2 100644 --- a/arch/x86/include/asm/vdso/processor.h +++ b/arch/x86/include/asm/vdso/processor.h @@ -7,15 +7,15 @@ #ifndef __ASSEMBLER__ -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static __always_inline void rep_nop(void) +/* PAUSE is a good thing to insert into busy-wait loops. */ +static __always_inline void native_pause(void) { - asm volatile("rep; nop" ::: "memory"); + asm volatile("pause" ::: "memory"); } static __always_inline void cpu_relax(void) { - rep_nop(); + native_pause(); } struct getcpu_cache; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 213cf5379a5a..36698cc9fb44 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -292,6 +292,7 @@ struct x86_hyper_runtime { * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus + * @get_nmi_reason get the reason an NMI was received * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if needed diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index bd0fc69a10a7..c2fc7869b996 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -43,7 +43,7 @@ extern struct start_info *xen_start_info; static inline uint32_t xen_cpuid_base(void) { - return hypervisor_cpuid_base(XEN_SIGNATURE, 2); + return cpuid_base_hypervisor(XEN_SIGNATURE, 2); } struct pci_dev; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 84cfa179802c..99a783fd4691 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -141,7 +141,6 @@ obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o -obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o obj-$(CONFIG_X86_UMIP) += umip.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dae6a73be40e..9fa321a95eb3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -23,6 +23,8 @@ #include <linux/serial_core.h> #include <linux/pgtable.h> +#include <xen/xen.h> + #include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> @@ -1729,6 +1731,15 @@ int __init acpi_mps_check(void) { #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ + + /* + * Xen disables ACPI in PV DomU guests but it still emulates APIC and + * supports SMP. Returning early here ensures that APIC is not disabled + * unnecessarily and the guest is not limited to a single vCPU. + */ + if (xen_pv_domain() && !xen_initial_domain()) + return 0; + if (acpi_disabled || acpi_noirq) { pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 77bfb846490c..7047124490f6 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -49,7 +49,7 @@ int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) { int err; - err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + err = rdmsrq_safe_on_cpu(cpunum, reg->address, val); if (!err) { u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, reg->bit_offset); @@ -65,7 +65,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) u64 rd_val; int err; - err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + err = rdmsrq_safe_on_cpu(cpunum, reg->address, &rd_val); if (!err) { u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, reg->bit_offset); @@ -74,7 +74,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) val &= mask; rd_val &= ~mask; rd_val |= val; - err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + err = wrmsrq_safe_on_cpu(cpunum, reg->address, rd_val); } return err; } @@ -147,7 +147,7 @@ int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) int ret; if (cpu_feature_enabled(X86_FEATURE_CPPC)) { - ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); + ret = rdmsrq_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); if (ret) goto out; @@ -272,7 +272,7 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) } /* detect if running on heterogeneous design */ - if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) { switch (core_type) { case TOPO_CPU_TYPE_UNKNOWN: pr_warn("Undefined core type found for cpu %d\n", cpu); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d5ac34186555..8698d66563ed 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -14,7 +14,7 @@ #include <acpi/processor.h> #include <asm/cpu_device_id.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/mwait.h> #include <asm/special_insns.h> #include <asm/smp.h> diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 6dfecb27b846..91fa262f0e30 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> #include <asm/hypervisor.h> +#include <asm/msr.h> #include <asm/smp.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bf82c6f7d690..ecfe7b497cad 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,36 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt -#include <linux/module.h> -#include <linux/sched.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> -#include <linux/mutex.h> -#include <linux/list.h> -#include <linux/stringify.h> -#include <linux/highmem.h> -#include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> -#include <linux/stop_machine.h> -#include <linux/slab.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/bsearch.h> -#include <linux/sync_core.h> +#include <linux/execmem.h> + #include <asm/text-patching.h> -#include <asm/alternative.h> -#include <asm/sections.h> -#include <asm/mce.h> -#include <asm/nmi.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> #include <asm/insn.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/paravirt.h> -#include <asm/asm-prototypes.h> -#include <asm/cfi.h> +#include <asm/ibt.h> +#include <asm/set_memory.h> +#include <asm/nmi.h> int __read_mostly alternatives_patched; @@ -124,6 +105,171 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = #endif }; +#ifdef CONFIG_FINEIBT +static bool cfi_paranoid __ro_after_init; +#endif + +#ifdef CONFIG_MITIGATION_ITS + +#ifdef CONFIG_MODULES +static struct module *its_mod; +#endif +static void *its_page; +static unsigned int its_offset; + +/* Initialize a thunk with the "jmp *reg; int3" instructions. */ +static void *its_init_thunk(void *thunk, int reg) +{ + u8 *bytes = thunk; + int offset = 0; + int i = 0; + +#ifdef CONFIG_FINEIBT + if (cfi_paranoid) { + /* + * When ITS uses indirect branch thunk the fineibt_paranoid + * caller sequence doesn't fit in the caller site. So put the + * remaining part of the sequence (<ea> + JNE) into the ITS + * thunk. + */ + bytes[i++] = 0xea; /* invalid instruction */ + bytes[i++] = 0x75; /* JNE */ + bytes[i++] = 0xfd; + + offset = 1; + } +#endif + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + bytes[i++] = 0xff; + bytes[i++] = 0xe0 + reg; /* jmp *reg */ + bytes[i++] = 0xcc; + + return thunk + offset; +} + +#ifdef CONFIG_MODULES +void its_init_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + mutex_lock(&text_mutex); + its_mod = mod; + its_page = NULL; +} + +void its_fini_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + WARN_ON_ONCE(its_mod != mod); + + its_mod = NULL; + its_page = NULL; + mutex_unlock(&text_mutex); + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} + +void its_free_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + execmem_free(page); + } + kfree(mod->its_page_array); +} +#endif /* CONFIG_MODULES */ + +static void *its_alloc(void) +{ + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + + if (!page) + return NULL; + +#ifdef CONFIG_MODULES + if (its_mod) { + void *tmp = krealloc(its_mod->its_page_array, + (its_mod->its_num_pages+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + its_mod->its_page_array = tmp; + its_mod->its_page_array[its_mod->its_num_pages++] = page; + + execmem_make_temp_rw(page, PAGE_SIZE); + } +#endif /* CONFIG_MODULES */ + + return no_free_ptr(page); +} + +static void *its_allocate_thunk(int reg) +{ + int size = 3 + (reg / 8); + void *thunk; + +#ifdef CONFIG_FINEIBT + /* + * The ITS thunk contains an indirect jump and an int3 instruction so + * its size is 3 or 4 bytes depending on the register used. If CFI + * paranoid is used then 3 extra bytes are added in the ITS thunk to + * complete the fineibt_paranoid caller sequence. + */ + if (cfi_paranoid) + size += 3; +#endif + + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { + its_page = its_alloc(); + if (!its_page) { + pr_err("ITS page allocation failed\n"); + return NULL; + } + memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); + its_offset = 32; + } + + /* + * If the indirect branch instruction will be in the lower half + * of a cacheline, then update the offset to reach the upper half. + */ + if ((its_offset + size - 1) % 64 < 32) + its_offset = ((its_offset - 1) | 0x3F) + 33; + + thunk = its_page + its_offset; + its_offset += size; + + return its_init_thunk(thunk, reg); +} + +u8 *its_static_thunk(int reg) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + +#ifdef CONFIG_FINEIBT + /* Paranoid thunk starts 2 bytes before */ + if (cfi_paranoid) + return thunk - 2; +#endif + return thunk; +} + +#endif + /* * Nomenclature for variable names to simplify and clarify this code and ease * any potential staring at it: @@ -171,13 +317,6 @@ static void add_nop(u8 *buf, unsigned int len) *buf = INT3_INSN_OPCODE; } -extern s32 __retpoline_sites[], __retpoline_sites_end[]; -extern s32 __return_sites[], __return_sites_end[]; -extern s32 __cfi_sites[], __cfi_sites_end[]; -extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); - /* * Matches NOP and NOPL, not any of the other possible NOPs. */ @@ -369,7 +508,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, } } -void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { __apply_relocation(buf, instr, instrlen, repl, repl_len); optimize_nops(instr, buf, instrlen); @@ -457,7 +596,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, DPRINTK(ALT, "alt table %px, -> %px", start, end); /* - * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * KASAN_SHADOW_START is defined using * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. * During the process, KASAN becomes confused seeing partial LA57 * conversion and triggers a false-positive out-of-bound report. @@ -525,7 +664,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); + text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -581,7 +720,8 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } -static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, + void *call_dest, void *jmp_dest) { u8 op = insn->opcode.bytes[0]; int i = 0; @@ -602,7 +742,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_call_thunk_array[reg], + call_dest, CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; @@ -610,7 +750,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_jump_thunk_array[reg], + jmp_dest, JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; @@ -625,6 +765,48 @@ clang_jcc: return i; } +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + return __emit_trampoline(addr, insn, bytes, + __x86_indirect_call_thunk_array[reg], + __x86_indirect_jump_thunk_array[reg]); +} + +#ifdef CONFIG_MITIGATION_ITS +static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + u8 *tmp = its_allocate_thunk(reg); + + if (tmp) + thunk = tmp; + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + +/* Check if an indirect branch is at ITS-unsafe address */ +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return false; + + /* Indirect branch opcode is 2 or 3 bytes depending on reg */ + addr += 1 + reg / 8; + + /* Lower-half of the cacheline? */ + return !(addr & 0x20); +} +#else /* CONFIG_MITIGATION_ITS */ + +#ifdef CONFIG_FINEIBT +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + return false; +} +#endif + +#endif /* CONFIG_MITIGATION_ITS */ + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -699,6 +881,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) bytes[i++] = 0xe8; /* LFENCE */ } +#ifdef CONFIG_MITIGATION_ITS + /* + * Check if the address of last byte of emitted-indirect is in + * lower-half of the cacheline. Such branches need ITS mitigation. + */ + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) + return emit_its_trampoline(addr, insn, reg, bytes); +#endif + ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; @@ -732,6 +923,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) int len, ret; u8 bytes[16]; u8 op1, op2; + u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) @@ -748,6 +940,12 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: + /* Check for cfi_paranoid + ITS */ + dest = addr + insn.length + insn.immediate.value; + if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + } break; case 0x0f: /* escape */ @@ -775,6 +973,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) #ifdef CONFIG_MITIGATION_RETHUNK +bool cpu_wants_rethunk(void) +{ + return cpu_feature_enabled(X86_FEATURE_RETHUNK); +} + +bool cpu_wants_rethunk_at(void *addr) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return false; + if (x86_return_thunk != its_return_thunk) + return true; + + return !((unsigned long)addr & 0x20); +} + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -791,7 +1004,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) int i = 0; /* Patch the custom return thunks... */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk_at(addr)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { @@ -808,7 +1021,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) static_call_force_reinit(); for (s = start; s < end; s++) { @@ -1022,8 +1235,6 @@ int cfi_get_func_arity(void *func) static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; -static bool cfi_paranoid __ro_after_init = false; - /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. @@ -1436,6 +1647,19 @@ static int cfi_rand_callers(s32 *start, s32 *end) return 0; } +static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; + +#ifdef CONFIG_MITIGATION_ITS + u8 *tmp = its_allocate_thunk(reg); + if (tmp) + thunk = tmp; +#endif + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; @@ -1477,9 +1701,14 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); memcpy(bytes + fineibt_caller_hash, &hash, 4); - ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); - if (WARN_ON_ONCE(ret != 3)) - continue; + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { + emit_paranoid_trampoline(addr + fineibt_caller_size, + &insn, 11, bytes + fineibt_caller_size); + } else { + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + } text_poke_early(addr, bytes, fineibt_paranoid_size); } @@ -1706,29 +1935,66 @@ Efault: return false; } +static bool is_paranoid_thunk(unsigned long addr) +{ + u32 thunk; + + __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); + return (thunk & 0x00FFFFFF) == 0xfd75ea; + +Efault: + return false; +} + /* * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] - * sequence. + * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS + * thunk. */ static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) { unsigned long addr = regs->ip - fineibt_paranoid_ud; - u32 hash; - if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) + if (!cfi_paranoid) return false; - __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault); - *target = regs->r11 + fineibt_preamble_size; - *type = regs->r10; + if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + } /* - * Since the trapping instruction is the exact, but LOCK prefixed, - * Jcc.d8 that got us here, the normal fixup will work. + * The cfi_paranoid + ITS thunk combination results in: + * + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 + * + * Where the paranoid_thunk looks like: + * + * 1d: <ea> (bad) + * __x86_indirect_paranoid_thunk_r11: + * 1e: 75 fd jne 1d + * __x86_indirect_its_thunk_r11: + * 20: 41 ff eb jmp *%r11 + * 23: cc int3 + * */ - return true; + if (is_paranoid_thunk(regs->ip)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + regs->ip = *target; + return true; + } -Efault: return false; } @@ -2010,7 +2276,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg) static noinline void __init alt_reloc_selftest(void) { /* - * Tests apply_relocation(). + * Tests text_poke_apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: @@ -2031,6 +2297,8 @@ static noinline void __init alt_reloc_selftest(void) void __init alternative_instructions(void) { + u64 ibt; + int3_selftest(); /* @@ -2057,6 +2325,9 @@ void __init alternative_instructions(void) */ paravirt_set_cap(); + /* Keep CET-IBT disabled until caller/callee are patched */ + ibt = ibt_save(/*disable*/ true); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); @@ -2080,6 +2351,8 @@ void __init alternative_instructions(void) */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + ibt_restore(ibt); + #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { @@ -2140,76 +2413,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - - /* - * Make sure not to be in TLB lazy mode, as otherwise we'll end up - * with a stale address space WITHOUT being in lazy mode after - * restoring the previous mm. - */ - if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(); - - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. - */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} +__ro_after_init struct mm_struct *text_poke_mm; +__ro_after_init unsigned long text_poke_mm_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { @@ -2229,7 +2434,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; - temp_mm_state_t prev; + struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; @@ -2266,7 +2471,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). @@ -2276,21 +2481,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ - prev = use_temporary_mm(poking_mm); + prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); + func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -2299,22 +2504,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(poking_mm, poking_addr, ptep); + pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(prev); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); @@ -2450,7 +2655,7 @@ static void do_sync_core(void *info) sync_core(); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } @@ -2460,64 +2665,66 @@ void text_poke_sync(void) * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ -struct text_poke_loc { +struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; - /* see text_poke_bp_batch() */ + const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; + /* see smp_text_poke_batch_finish() */ u8 old; }; -struct bp_patching_desc { - struct text_poke_loc *vec; +#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) + +static struct smp_text_poke_array { + struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; - atomic_t refs; -}; +} text_poke_array; -static struct bp_patching_desc bp_desc; +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static __always_inline -struct bp_patching_desc *try_get_desc(void) +/* + * These four __always_inline annotations imply noinstr, necessary + * due to smp_text_poke_int3_handler() being noinstr: + */ + +static __always_inline bool try_get_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); - if (!raw_atomic_inc_not_zero(&desc->refs)) - return NULL; + if (!raw_atomic_inc_not_zero(refs)) + return false; - return desc; + return true; } -static __always_inline void put_desc(void) +static __always_inline void put_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); smp_mb__before_atomic(); - raw_atomic_dec(&desc->refs); + raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { - return _stext + tp->rel_addr; + return _stext + tpl->rel_addr; } -static __always_inline int patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { - struct text_poke_loc *tp = (struct text_poke_loc *) elt; - - if (key < text_poke_addr(tp)) + if (tpl_a < text_poke_addr(tpl_b)) return -1; - if (key > text_poke_addr(tp)) + if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } -noinstr int poke_int3_handler(struct pt_regs *regs) +noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct bp_patching_desc *desc; - struct text_poke_loc *tp; + struct smp_text_poke_loc *tpl; int ret = 0; void *ip; @@ -2526,41 +2733,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc with non-zero refcount: + * text_poke_array with non-zero refcount: * - * bp_desc.refs = 1 INT3 - * WMB RMB - * write INT3 if (bp_desc.refs != 0) + * text_poke_array_refs = 1 INT3 + * WMB RMB + * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); - desc = try_get_desc(); - if (!desc) + if (!try_get_text_poke_array()) return 0; /* - * Discount the INT3. See text_poke_bp_batch(). + * Discount the INT3. See smp_text_poke_batch_finish(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(desc->nr_entries > 1)) { - tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), + if (unlikely(text_poke_array.nr_entries > 1)) { + tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, + sizeof(struct smp_text_poke_loc), patch_cmp); - if (!tp) + if (!tpl) goto out_put; } else { - tp = desc->vec; - if (text_poke_addr(tp) != ip) + tpl = text_poke_array.vec; + if (text_poke_addr(tpl) != ip) goto out_put; } - ip += tp->len; + ip += tpl->len; - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, @@ -2573,16 +2779,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->disp); + int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->disp); + int3_emulate_jmp(regs, (long)ip + tpl->disp); break; case 0x70 ... 0x7f: /* Jcc */ - int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: @@ -2592,51 +2798,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(); + put_text_poke_array(); return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - /** - * text_poke_bp_batch() -- update instructions on live kernel on SMP - * @tp: vector of instructions to patch - * @nr_entries: number of entries in the vector + * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. + * Input state: + * text_poke_array.vec: vector of instructions to patch + * text_poke_array.nr_entries: number of entries in the vector + * + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. * * The way it is done: * - For each entry in the vector: - * - add a int3 trap to the address that will be patched - * - sync cores + * - add an INT3 trap to the address that will be patched + * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range - * - sync cores + * - SMP sync all CPUs * - For each entry in the vector: - * - replace the first byte (int3) by the first byte of + * - replace the first byte (INT3) by the first byte of the * replacing opcode - * - sync cores + * - SMP sync all CPUs */ -static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +void smp_text_poke_batch_finish(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; - lockdep_assert_held(&text_mutex); + if (!text_poke_array.nr_entries) + return; - bp_desc.vec = tp; - bp_desc.nr_entries = nr_entries; + lockdep_assert_held(&text_mutex); /* - * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date bp_desc data. + * Corresponds to the implicit memory barrier in try_get_text_poke_array() to + * ensure reading a non-zero refcount provides up to date text_poke_array data. */ - atomic_set_release(&bp_desc.refs, 1); + for_each_possible_cpu(i) + atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* * Function tracing can enable thousands of places that need to be @@ -2649,33 +2854,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries cond_resched(); /* - * Corresponding read barrier in int3 notifier for making sure the - * nr_entries and handler are correctly ordered wrt. patching. + * Corresponding read barrier in INT3 notifier for making sure the + * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* - * First step: add a int3 trap to the address that will be patched. + * First step: add a INT3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) { - tp[i].old = *(u8 *)text_poke_addr(&tp[i]); - text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + for (i = 0; i < text_poke_array.nr_entries; i++) { + text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; - u8 _new[POKE_MAX_OPCODE_SIZE+1]; - const u8 *new = tp[i].text; - int len = tp[i].len; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; + u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = text_poke_array.vec[i].text; + int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, - text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { @@ -2684,7 +2889,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries new = _new; } - text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); @@ -2715,7 +2920,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); + perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { @@ -2724,63 +2929,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } /* - * Third step: replace the first byte (int3) by the first byte of + * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 byte = tp[i].text[0]; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 byte = text_poke_array.vec[i].text[0]; - if (tp[i].len == 6) + if (text_poke_array.vec[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Remove and wait for refs to be zero. + * + * Notably, if after step-3 above the INT3 got removed, then the + * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 + * handlers and the below spin-wait will not happen. + * + * IOW. unless the replacement instruction is INT3, this case goes + * unused. */ - if (!atomic_dec_and_test(&bp_desc.refs)) - atomic_cond_read_acquire(&bp_desc.refs, !VAL); + for_each_possible_cpu(i) { + atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); + + if (unlikely(!atomic_dec_and_test(refs))) + atomic_cond_read_acquire(refs, !VAL); + } + + /* They are all completed: */ + text_poke_array.nr_entries = 0; } -static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { + struct smp_text_poke_loc *tpl; struct insn insn; int ret, i = 0; + tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; + if (len == 6) i = 1; - memcpy((void *)tp->text, opcode+i, len-i); + memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); - tp->rel_addr = addr - (void *)_stext; - tp->len = len; - tp->opcode = insn.opcode.bytes[0]; + tpl->rel_addr = addr - (void *)_stext; + tpl->len = len; + tpl->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ - tp->opcode = insn.opcode.bytes[1] - 0x10; + tpl->opcode = insn.opcode.bytes[1] - 0x10; } - switch (tp->opcode) { + switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: @@ -2789,14 +3010,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) - BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -2805,21 +3026,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ - tp->disp = insn.immediate.value; + tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP8_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP8_INSN_OPCODE; + tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP32_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP32_INSN_OPCODE; + tpl->disp = 0; break; default: /* unknown instruction */ @@ -2830,51 +3051,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } /* - * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. */ -static bool tp_order_fail(void *addr) +static bool text_poke_addr_ordered(void *addr) { - struct text_poke_loc *tp; - - if (!tp_vec_nr) - return false; - - if (!addr) /* force */ - return true; + WARN_ON_ONCE(!addr); - tp = &tp_vec[tp_vec_nr - 1]; - if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + if (!text_poke_array.nr_entries) return true; - return false; -} - -static void text_poke_flush(void *addr) -{ - if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { - text_poke_bp_batch(tp_vec, tp_vec_nr); - tp_vec_nr = 0; - } -} + /* + * If the last current entry's address is higher than the + * new entry's address we'd like to add, then ordering + * is violated and we must first flush all pending patching + * requests: + */ + if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) + return false; -void text_poke_finish(void) -{ - text_poke_flush(NULL); + return true; } -void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +/** + * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Add a new instruction to the current queue of to-be-patched instructions + * the kernel maintains. The patching request will not be executed immediately, + * but becomes part of an array of patching requests, optimized for batched + * execution. All pending patching requests will be executed on the next + * smp_text_poke_batch_finish() call. + */ +void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc *tp; - - text_poke_flush(addr); - - tp = &tp_vec[tp_vec_nr++]; - text_poke_loc_init(tp, addr, opcode, len, emulate); + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) + smp_text_poke_batch_finish(); + __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** - * text_poke_bp() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -2882,12 +3102,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is - * not possible to allocate memory. + * not possible to allocate memory for a vector. The single instruction + * is patched in immediately. */ -void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp; - - text_poke_loc_init(&tp, addr, opcode, len, emulate); - text_poke_bp_batch(&tp, 1); + __smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_finish(); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index c884deca839b..3485d419c2f5 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -39,7 +39,7 @@ #include <asm/gart.h> #include <asm/set_memory.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6d12a9b69432..c1acead6227a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -13,7 +13,9 @@ #include <linux/export.h> #include <linux/spinlock.h> #include <linux/pci_ids.h> -#include <asm/amd_nb.h> + +#include <asm/amd/nb.h> +#include <asm/cpuid/api.h> static u32 *flush_words; @@ -91,10 +93,7 @@ static int amd_cache_northbridges(void) if (amd_gart_present()) amd_northbridges.flags |= AMD_NB_GART; - /* - * Check for L3 cache presence. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return 0; /* @@ -151,7 +150,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */ if (boot_cpu_data.x86 < 0x10 || - rdmsrl_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr)) + rdmsrq_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr)) return NULL; /* mmconfig is not enabled */ diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index b670fa85c61b..a40176b62eb5 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -9,7 +9,7 @@ */ #include <linux/debugfs.h> -#include <asm/amd_node.h> +#include <asm/amd/node.h> /* * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 89c0c8a3fc7e..769321185a08 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -29,7 +29,7 @@ #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/x86_init.h> #include <linux/crash_dump.h> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 62584a347931..d73ba5a7b623 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -59,6 +59,7 @@ #include <asm/time.h> #include <asm/smp.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/tsc.h> #include <asm/hypervisor.h> #include <asm/cpu_device_id.h> @@ -425,7 +426,7 @@ static int lapic_next_deadline(unsigned long delta, weak_wrmsr_fence(); tsc = rdtsc(); - wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; } @@ -449,7 +450,7 @@ static int lapic_timer_shutdown(struct clock_event_device *evt) * the timer _and_ zero the counter registers: */ if (v & APIC_LVT_TIMER_TSCDEADLINE) - wrmsrl(MSR_IA32_TSC_DEADLINE, 0); + wrmsrq(MSR_IA32_TSC_DEADLINE, 0); else apic_write(APIC_TMICT, 0); @@ -1694,7 +1695,7 @@ static bool x2apic_hw_locked(void) x86_arch_cap_msr = x86_read_arch_cap_msr(); if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { - rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + rdmsrq(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } return false; @@ -1707,12 +1708,12 @@ static void __x2apic_disable(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return; - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); if (!(msr & X2APIC_ENABLE)) return; /* Disable xapic and x2apic first and then reenable xapic mode */ - wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); - wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); + wrmsrq(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrq(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); printk_once(KERN_INFO "x2apic disabled\n"); } @@ -1720,10 +1721,10 @@ static void __x2apic_enable(void) { u64 msr; - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); if (msr & X2APIC_ENABLE) return; - wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + wrmsrq(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); printk_once(KERN_INFO "x2apic enabled\n"); } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 16410f087b7a..e272bc7fdc8e 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/pgtable.h> +#include <asm/msr.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> @@ -31,7 +32,7 @@ static u32 numachip1_get_apic_id(u32 x) unsigned int id = (x >> 24) & 0xff; if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { - rdmsrl(MSR_FAM10H_NODE_ID, value); + rdmsrq(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -42,7 +43,7 @@ static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; - rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + rdmsrq(MSR_FAM10H_MMIO_CONF_BASE, mcfg); return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } @@ -150,7 +151,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) /* Account for nodes per socket in multi-core-module processors */ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - rdmsrl(MSR_FAM10H_NODE_ID, val); + rdmsrq(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index eebc360ed1bb..5ba2feb2c04c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1486,7 +1486,7 @@ static void __init delay_with_tsc(void) * 1 GHz == 40 jiffies */ do { - rep_nop(); + native_pause(); now = rdtsc(); } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end)); } @@ -2225,7 +2225,7 @@ static int mp_irqdomain_create(int ioapic) /* Handle device tree enumerated APICs proper */ if (cfg->dev) { - fn = of_node_to_fwnode(cfg->dev); + fn = of_fwnode_handle(cfg->dev); } else { fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index ad4ea6fb3b6c..6259b474073b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -33,6 +33,14 @@ static void __used common(void) { + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); OFFSET(TASK_threadsp, task_struct, thread.sp); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 2b411cd00a4e..e0a292db97b2 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -12,15 +12,6 @@ void foo(void); void foo(void) { - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); - OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); - OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); - OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); - OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); - OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); - OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index d86d7d6e750c..a951333c5995 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -185,7 +185,7 @@ static void *patch_dest(void *dest, bool direct) u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); - apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); + text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? */ if (!bcmp(pad, insn_buff, tsize)) @@ -294,7 +294,7 @@ static bool is_callthunk(void *addr) pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } @@ -312,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index 303bf74d175b..99444409c026 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -2,6 +2,7 @@ #include <linux/ptrace.h> #include <asm/bugs.h> +#include <asm/msr.h> #include <asm/traps.h> enum cp_error_code { @@ -55,7 +56,7 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) * will be whatever is live in userspace. So read the SSP before enabling * interrupts so locking the fpregs to do it later is not required. */ - rdmsrl(MSR_IA32_PL3_SSP, ssp); + rdmsrq(MSR_IA32_PL3_SSP, ssp); cond_local_irq_enable(regs); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4efdf5c2efc8..1e26179ff18c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,7 +24,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o -obj-y += cpuid-deps.o +obj-y += cpuid-deps.o cpuid_0x2_table.o obj-y += umwait.o obj-y += capflags.o powerflags.o @@ -38,6 +38,9 @@ obj-y += intel.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o +ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy) +obj-y += amd_cache_disable.o +endif obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 79569f72b8ee..93da466dfe2c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,6 +9,7 @@ #include <linux/sched/clock.h> #include <linux/random.h> #include <linux/topology.h> +#include <asm/amd/fch.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -21,6 +22,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/sev.h> #ifdef CONFIG_X86_64 @@ -31,7 +33,7 @@ u16 invlpgb_count_max __ro_after_init; -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { u32 gprs[8] = { 0 }; int err; @@ -49,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) return err; } -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +static inline int wrmsrq_amd_safe(unsigned msr, u64 val) { u32 gprs[8] = { 0 }; @@ -383,7 +385,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) (c->x86 == 0x10 && c->x86_model >= 0x2)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -422,7 +424,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; @@ -472,6 +474,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) case 0x60 ... 0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); break; + case 0x50 ... 0x5f: + case 0x90 ... 0xaf: + case 0xc0 ... 0xcf: + setup_force_cpu_cap(X86_FEATURE_ZEN6); + break; default: goto warn; } @@ -508,7 +515,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; @@ -525,7 +532,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) if (!sme_me_mask) setup_clear_cpu_cap(X86_FEATURE_SME); - rdmsrl(MSR_K7_HWCR, msr); + rdmsrq(MSR_K7_HWCR, msr); if (!(msr & MSR_K7_HWCR_SMMLOCK)) goto clear_sev; @@ -612,7 +619,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); - else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); setup_force_cpu_cap(X86_FEATURE_SBPB); } @@ -636,14 +643,14 @@ static void init_amd_k8(struct cpuinfo_x86 *c) */ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &value)) { + if (!rdmsrq_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); - wrmsrl_amd_safe(0xc001100d, value); + wrmsrq_amd_safe(0xc001100d, value); } } if (!c->x86_model_id[0]) - strcpy(c->x86_model_id, "Hammer"); + strscpy(c->x86_model_id, "Hammer"); #ifdef CONFIG_SMP /* @@ -788,9 +795,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs. */ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { + if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { value |= 0x1E; - wrmsrl_safe(MSR_F15H_IC_CFG, value); + wrmsrq_safe(MSR_F15H_IC_CFG, value); } } @@ -805,6 +812,7 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static const struct x86_cpu_id erratum_1386_microcode[] = { X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e), X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052), + {} }; static void fix_erratum_1386(struct cpuinfo_x86 *c) @@ -838,9 +846,9 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * suppresses non-branch predictions. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { - if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; - wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } #endif @@ -868,6 +876,16 @@ static void init_amd_zen1(struct cpuinfo_x86 *c) pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); + + /* + * Turn off the Instructions Retired free counter on machines that are + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (c->x86_model < 0x30) { + msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + clear_cpu_cap(c, X86_FEATURE_IRPERF); + } } static bool cpu_has_zenbleed_microcode(void) @@ -1014,7 +1032,7 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); @@ -1051,13 +1069,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); - /* - * Turn on the Instructions Retired free counter on machines not - * susceptible to erratum #1054 "Instructions Retired Performance - * Counter May Be Inaccurate". - */ - if (cpu_has(c, X86_FEATURE_IRPERF) && - (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) + /* Enable the Instructions Retired free counter */ + if (cpu_has(c, X86_FEATURE_IRPERF)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); @@ -1200,7 +1213,7 @@ void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) return; - wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + wrmsrq(amd_msr_dr_addr_masks[dr], mask); per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; } @@ -1231,3 +1244,56 @@ void amd_check_microcode(void) if (cpu_feature_enabled(X86_FEATURE_ZEN2)) on_each_cpu(zenbleed_check_cpu, NULL, 1); } + +static const char * const s5_reset_reason_txt[] = { + [0] = "thermal pin BP_THERMTRIP_L was tripped", + [1] = "power button was pressed for 4 seconds", + [2] = "shutdown pin was tripped", + [4] = "remote ASF power off command was received", + [9] = "internal CPU thermal limit was tripped", + [16] = "system reset pin BP_SYS_RST_L was tripped", + [17] = "software issued PCI reset", + [18] = "software wrote 0x4 to reset control register 0xCF9", + [19] = "software wrote 0x6 to reset control register 0xCF9", + [20] = "software wrote 0xE to reset control register 0xCF9", + [21] = "ACPI power state transition occurred", + [22] = "keyboard reset pin KB_RST_L was tripped", + [23] = "internal CPU shutdown event occurred", + [24] = "system failed to boot before failed boot timer expired", + [25] = "hardware watchdog timer expired", + [26] = "remote ASF reset command was received", + [27] = "an uncorrected error caused a data fabric sync flood event", + [29] = "FCH and MP1 failed warm reset handshake", + [30] = "a parity error occurred", + [31] = "a software sync flood event occurred", +}; + +static __init int print_s5_reset_status_mmio(void) +{ + unsigned long value; + void __iomem *addr; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value)); + if (!addr) + return 0; + + value = ioread32(addr); + iounmap(addr); + + for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) { + if (!(value & BIT(i))) + continue; + + if (s5_reset_reason_txt[i]) { + pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n", + value, s5_reset_reason_txt[i]); + } + } + + return 0; +} +late_initcall(print_s5_reset_status_mmio); diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c new file mode 100644 index 000000000000..8843b9557aea --- /dev/null +++ b/arch/x86/kernel/cpu/amd_cache_disable.c @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD L3 cache_disable_{0,1} sysfs handling + * Documentation/ABI/testing/sysfs-devices-system-cpu + */ + +#include <linux/cacheinfo.h> +#include <linux/capability.h> +#include <linux/pci.h> +#include <linux/sysfs.h> + +#include <asm/amd/nb.h> + +#include "cpu.h" + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot) +{ + int index; + struct amd_northbridge *nb = ci->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sysfs_emit(buf, "%d\n", index); + + return sysfs_emit(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return show_cache_disable(ci, buf, slot); \ +} + +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf, + size_t count, unsigned int slot) +{ + struct amd_northbridge *nb = ci->priv; + unsigned long val = 0; + int cpu, err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&ci->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return store_cache_disable(ci, buf, count, slot); \ +} + +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + + return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *ci = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!ci->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + static struct attribute **amd_l3_attrs; + int n = 1; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci) +{ + struct amd_northbridge *nb = ci->priv; + + if (ci->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +struct amd_northbridge *amd_init_l3_cache(int index) +{ + struct amd_northbridge *nb; + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return NULL; + + node = topology_amd_node_id(smp_processor_id()); + nb = node_to_amd_nb(node); + if (nb && !nb->l3_cache.indices) + amd_calc_l3_indices(nb); + + return nb; +} diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 6cf31a1649c4..a315b0627dfb 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -20,6 +20,7 @@ #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "cpu.h" @@ -40,8 +41,8 @@ static void init_counter_refs(void) { u64 aperf, mperf; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); @@ -99,7 +100,7 @@ static bool __init turbo_disabled(void) u64 misc_en; int err; - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; @@ -110,11 +111,11 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; @@ -152,13 +153,13 @@ static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int err, i; u64 msr; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -190,17 +191,17 @@ static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int s u32 group_size; int err, i; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; @@ -220,11 +221,11 @@ static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) u64 msr; int err; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -474,8 +475,8 @@ void arch_scale_freq_tick(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4386aa6c69e1..7f94e6a5497d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -34,21 +34,66 @@ #include "cpu.h" +/* + * Speculation Vulnerability Handling + * + * Each vulnerability is handled with the following functions: + * <vuln>_select_mitigation() -- Selects a mitigation to use. This should + * take into account all relevant command line + * options. + * <vuln>_update_mitigation() -- This is called after all vulnerabilities have + * selected a mitigation, in case the selection + * may want to change based on other choices + * made. This function is optional. + * <vuln>_apply_mitigation() -- Enable the selected mitigation. + * + * The compile-time mitigation in all cases should be AUTO. An explicit + * command-line option can override AUTO. If no such option is + * provided, <vuln>_select_mitigation() will override AUTO to the best + * mitigation option. + */ + static void __init spectre_v1_select_mitigation(void); +static void __init spectre_v1_apply_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init spectre_v2_update_mitigation(void); +static void __init spectre_v2_apply_mitigation(void); static void __init retbleed_select_mitigation(void); +static void __init retbleed_update_mitigation(void); +static void __init retbleed_apply_mitigation(void); static void __init spectre_v2_user_select_mitigation(void); +static void __init spectre_v2_user_update_mitigation(void); +static void __init spectre_v2_user_apply_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init ssb_apply_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init l1tf_apply_mitigation(void); static void __init mds_select_mitigation(void); -static void __init md_clear_update_mitigation(void); -static void __init md_clear_select_mitigation(void); +static void __init mds_update_mitigation(void); +static void __init mds_apply_mitigation(void); static void __init taa_select_mitigation(void); +static void __init taa_update_mitigation(void); +static void __init taa_apply_mitigation(void); static void __init mmio_select_mitigation(void); +static void __init mmio_update_mitigation(void); +static void __init mmio_apply_mitigation(void); +static void __init rfds_select_mitigation(void); +static void __init rfds_update_mitigation(void); +static void __init rfds_apply_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init srbds_apply_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); +static void __init srso_update_mitigation(void); +static void __init srso_apply_mitigation(void); static void __init gds_select_mitigation(void); +static void __init gds_apply_mitigation(void); +static void __init bhi_select_mitigation(void); +static void __init bhi_update_mitigation(void); +static void __init bhi_apply_mitigation(void); +static void __init its_select_mitigation(void); +static void __init its_update_mitigation(void); +static void __init its_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -59,7 +104,6 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; -EXPORT_SYMBOL_GPL(x86_pred_cmd); static u64 __ro_after_init x86_arch_cap_msr; @@ -67,11 +111,19 @@ static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + if (x86_return_thunk != __x86_return_thunk) + pr_warn("x86/bugs: return thunk changed\n"); + + x86_return_thunk = thunk; +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { this_cpu_write(x86_spec_ctrl_current, val); - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } /* @@ -90,7 +142,7 @@ void update_spec_ctrl_cond(u64 val) * forced the update can be delayed until that time. */ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } noinstr u64 spec_ctrl_current(void) @@ -128,9 +180,13 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ -DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); -EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +/* + * Controls CPU Fill buffer clear before VMenter. This is a subset of + * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only + * mitigation is required. + */ +DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); +EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); void __init cpu_select_mitigations(void) { @@ -140,7 +196,7 @@ void __init cpu_select_mitigations(void) * init code as it is not enumerated and depends on the family. */ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { - rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); /* * Previously running kernel (kexec), may have some controls @@ -155,30 +211,67 @@ void __init cpu_select_mitigations(void) /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - /* - * retbleed_select_mitigation() relies on the state set by - * spectre_v2_select_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ retbleed_select_mitigation(); - /* - * spectre_v2_user_select_mitigation() relies on the state set by - * retbleed_select_mitigation(); specifically the STIBP selection is - * forced for UNRET or IBPB. - */ spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); - md_clear_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + srso_select_mitigation(); + gds_select_mitigation(); + its_select_mitigation(); + bhi_select_mitigation(); /* - * srso_select_mitigation() depends and must run after - * retbleed_select_mitigation(). + * After mitigations are selected, some may need to update their + * choices. */ - srso_select_mitigation(); - gds_select_mitigation(); + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + /* + * its_update_mitigation() depends on spectre_v2_update_mitigation() + * and retbleed_update_mitigation(). + */ + its_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ + srso_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + its_apply_mitigation(); + bhi_apply_mitigation(); } /* @@ -228,9 +321,9 @@ static void x86_amd_ssb_disable(void) u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) - wrmsrl(MSR_AMD64_LS_CFG, msrval); + wrmsrq(MSR_AMD64_LS_CFG, msrval); } #undef pr_fmt @@ -281,6 +374,12 @@ enum rfds_mitigations { static enum rfds_mitigations rfds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; +/* + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing + * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. + */ +static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; + static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { @@ -291,12 +390,34 @@ static void __init mds_select_mitigation(void) if (mds_mitigation == MDS_MITIGATION_AUTO) mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_OFF) + return; + + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; + + /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */ + if (verw_clear_cpu_buf_mitigation_selected) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + } - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + pr_info("%s\n", mds_strings[mds_mitigation]); +} +static void __init mds_apply_mitigation(void) +{ + if (mds_mitigation == MDS_MITIGATION_FULL || + mds_mitigation == MDS_MITIGATION_VMWERV) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); @@ -336,6 +457,11 @@ static const char * const taa_strings[] = { [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; +static bool __init taa_vulnerable(void) +{ + return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM); +} + static void __init taa_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_TAA)) { @@ -349,48 +475,63 @@ static void __init taa_select_mitigation(void) return; } - if (cpu_mitigations_off()) { + if (cpu_mitigations_off()) taa_mitigation = TAA_MITIGATION_OFF; - return; - } - /* - * TAA mitigation via VERW is turned off if both - * tsx_async_abort=off and mds=off are specified. - */ - if (taa_mitigation == TAA_MITIGATION_OFF && - mds_mitigation == MDS_MITIGATION_OFF) + /* Microcode will be checked in taa_update_mitigation(). */ + if (taa_mitigation == TAA_MITIGATION_AUTO) + taa_mitigation = TAA_MITIGATION_VERW; + + if (taa_mitigation != TAA_MITIGATION_OFF) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init taa_update_mitigation(void) +{ + if (!taa_vulnerable() || cpu_mitigations_off()) return; - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + if (verw_clear_cpu_buf_mitigation_selected) taa_mitigation = TAA_MITIGATION_VERW; - else - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. - * A microcode update fixes this behavior to clear CPU buffers. It also - * adds support for MSR_IA32_TSX_CTRL which is enumerated by the - * ARCH_CAP_TSX_CTRL_MSR bit. - * - * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode - * update is required. - */ - if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && - !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + if (taa_mitigation == TAA_MITIGATION_VERW) { + /* Check if the requisite ucode is available. */ + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * TSX is enabled, select alternate mitigation for TAA which is - * the same as MDS. Enable MDS static branch to clear CPU buffers. - * - * For guests that can't determine whether the correct microcode is - * present on host, enable the mitigation for UCODE_NEEDED as well. - */ - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + } - if (taa_nosmt || cpu_mitigations_auto_nosmt()) - cpu_smt_disable(false); + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static void __init taa_apply_mitigation(void) +{ + if (taa_mitigation == TAA_MITIGATION_VERW || + taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) { + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + } } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -428,31 +569,67 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } + /* Microcode will be checked in mmio_update_mitigation(). */ + if (mmio_mitigation == MMIO_MITIGATION_AUTO) + mmio_mitigation = MMIO_MITIGATION_VERW; + if (mmio_mitigation == MMIO_MITIGATION_OFF) return; /* * Enable CPU buffer clear mitigation for host and VMM, if also affected - * by MDS or TAA. Otherwise, enable mitigation for VMM only. + * by MDS or TAA. */ - if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && - boot_cpu_has(X86_FEATURE_RTM))) - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mmio_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + mmio_mitigation = MMIO_MITIGATION_VERW; + + if (mmio_mitigation == MMIO_MITIGATION_VERW) { + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))) + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", mmio_strings[mmio_mitigation]); +} + +static void __init mmio_apply_mitigation(void) +{ + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; /* - * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based - * mitigations, disable KVM-only mitigation in that case. + * Only enable the VMM mitigation if the CPU buffer clear mitigation is + * not being used. */ - if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - static_branch_disable(&mmio_stale_data_clear); - else - static_branch_enable(&mmio_stale_data_clear); + if (verw_clear_cpu_buf_mitigation_selected) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + static_branch_disable(&cpu_buf_vm_clear); + } else { + static_branch_enable(&cpu_buf_vm_clear); + } /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can @@ -462,21 +639,6 @@ static void __init mmio_select_mitigation(void) if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); - /* - * Check if the system has the right microcode. - * - * CPU Fill buffer clear mitigation is enumerated by either an explicit - * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS - * affected systems. - */ - if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || - (boot_cpu_has(X86_FEATURE_MD_CLEAR) && - boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) - mmio_mitigation = MMIO_MITIGATION_VERW; - else - mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; - if (mmio_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } @@ -511,22 +673,48 @@ static const char * const rfds_strings[] = { [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", }; +static inline bool __init verw_clears_cpu_reg_file(void) +{ + return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR); +} + static void __init rfds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } + + if (rfds_mitigation == RFDS_MITIGATION_AUTO) + rfds_mitigation = RFDS_MITIGATION_VERW; + if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (rfds_mitigation == RFDS_MITIGATION_AUTO) + if (verw_clears_cpu_reg_file()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init rfds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) rfds_mitigation = RFDS_MITIGATION_VERW; - if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) + if (rfds_mitigation == RFDS_MITIGATION_VERW) { + if (!verw_clears_cpu_reg_file()) + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", rfds_strings[rfds_mitigation]); +} + +static void __init rfds_apply_mitigation(void) +{ + if (rfds_mitigation == RFDS_MITIGATION_VERW) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - else - rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; } static __init int rfds_parse_cmdline(char *str) @@ -547,76 +735,11 @@ static __init int rfds_parse_cmdline(char *str) early_param("reg_file_data_sampling", rfds_parse_cmdline); #undef pr_fmt -#define pr_fmt(fmt) "" fmt - -static void __init md_clear_update_mitigation(void) -{ - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - goto out; - - /* - * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO - * Stale Data mitigation, if necessary. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } - if (taa_mitigation == TAA_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_TAA)) { - taa_mitigation = TAA_MITIGATION_VERW; - taa_select_mitigation(); - } - /* - * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear - * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. - */ - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { - mmio_mitigation = MMIO_MITIGATION_VERW; - mmio_select_mitigation(); - } - if (rfds_mitigation == RFDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_RFDS)) { - rfds_mitigation = RFDS_MITIGATION_VERW; - rfds_select_mitigation(); - } -out: - if (boot_cpu_has_bug(X86_BUG_MDS)) - pr_info("MDS: %s\n", mds_strings[mds_mitigation]); - if (boot_cpu_has_bug(X86_BUG_TAA)) - pr_info("TAA: %s\n", taa_strings[taa_mitigation]); - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) - pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); - else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - pr_info("MMIO Stale Data: Unknown: No mitigations\n"); - if (boot_cpu_has_bug(X86_BUG_RFDS)) - pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); -} - -static void __init md_clear_select_mitigation(void) -{ - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - - /* - * As these mitigations are inter-related and rely on VERW instruction - * to clear the microarchitural buffers, update and print their status - * after mitigation selection is done for each of these vulnerabilities. - */ - md_clear_update_mitigation(); -} - -#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_AUTO, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, @@ -624,7 +747,7 @@ enum srbds_mitigations { }; static enum srbds_mitigations srbds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -656,7 +779,7 @@ void update_srbds_msr(void) if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { case SRBDS_MITIGATION_OFF: @@ -670,13 +793,18 @@ void update_srbds_msr(void) break; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); } static void __init srbds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) { + srbds_mitigation = SRBDS_MITIGATION_OFF; return; + } + + if (srbds_mitigation == SRBDS_MITIGATION_AUTO) + srbds_mitigation = SRBDS_MITIGATION_FULL; /* * Check to see if this is one of the MDS_NO systems supporting TSX that @@ -690,13 +818,17 @@ static void __init srbds_select_mitigation(void) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; - else if (cpu_mitigations_off() || srbds_off) + else if (srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; - update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } +static void __init srbds_apply_mitigation(void) +{ + update_srbds_msr(); +} + static int __init srbds_parse_cmdline(char *str) { if (!str) @@ -743,6 +875,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline); enum gds_mitigations { GDS_MITIGATION_OFF, + GDS_MITIGATION_AUTO, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, @@ -751,7 +884,7 @@ enum gds_mitigations { }; static enum gds_mitigations gds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -776,7 +909,7 @@ void update_gds_msr(void) switch (gds_mitigation) { case GDS_MITIGATION_OFF: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl |= GDS_MITG_DIS; break; case GDS_MITIGATION_FULL_LOCKED: @@ -786,23 +919,24 @@ void update_gds_msr(void) * CPUs. */ case GDS_MITIGATION_FULL: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: + case GDS_MITIGATION_AUTO: return; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); /* * Check to make sure that the WRMSR value was not ignored. Writes to * GDS_MITG_DIS will be ignored if this processor is locked but the boot * processor was not. */ - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); } @@ -815,33 +949,28 @@ static void __init gds_select_mitigation(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; - goto out; + return; } if (cpu_mitigations_off()) gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ + if (gds_mitigation == GDS_MITIGATION_AUTO) + gds_mitigation = GDS_MITIGATION_FULL; + /* No microcode */ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { - if (gds_mitigation == GDS_MITIGATION_FORCE) { - /* - * This only needs to be done on the boot CPU so do it - * here rather than in update_gds_msr() - */ - setup_clear_cpu_cap(X86_FEATURE_AVX); - pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); - } else { + if (gds_mitigation != GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; - } - goto out; + return; } /* Microcode has mitigation, use it */ if (gds_mitigation == GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_FULL; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) pr_warn("Mitigation locked. Disable failed.\n"); @@ -855,9 +984,25 @@ static void __init gds_select_mitigation(void) */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } +} + +static void __init gds_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + /* Microcode is present */ + if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL) + update_gds_msr(); + else if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } - update_gds_msr(); -out: pr_info("%s\n", gds_strings[gds_mitigation]); } @@ -918,10 +1063,14 @@ static bool smap_works_speculatively(void) static void __init spectre_v1_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; +} + +static void __init spectre_v1_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) return; - } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* @@ -974,8 +1123,20 @@ enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_AUTO, + ITS_MITIGATION_VMEXIT_ONLY, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static enum its_mitigation its_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF; + enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_AUTO, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, @@ -983,14 +1144,6 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_STUFF, }; -enum retbleed_mitigation_cmd { - RETBLEED_CMD_OFF, - RETBLEED_CMD_AUTO, - RETBLEED_CMD_UNRET, - RETBLEED_CMD_IBPB, - RETBLEED_CMD_STUFF, -}; - static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", @@ -1001,9 +1154,7 @@ static const char * const retbleed_strings[] = { }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = - RETBLEED_MITIGATION_NONE; -static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE; static int __ro_after_init retbleed_nosmt = false; @@ -1020,15 +1171,15 @@ static int __init retbleed_parse_cmdline(char *str) } if (!strcmp(str, "off")) { - retbleed_cmd = RETBLEED_CMD_OFF; + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } else if (!strcmp(str, "auto")) { - retbleed_cmd = RETBLEED_CMD_AUTO; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else if (!strcmp(str, "unret")) { - retbleed_cmd = RETBLEED_CMD_UNRET; + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else if (!strcmp(str, "ibpb")) { - retbleed_cmd = RETBLEED_CMD_IBPB; + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else if (!strcmp(str, "stuff")) { - retbleed_cmd = RETBLEED_CMD_STUFF; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { @@ -1049,77 +1200,122 @@ early_param("retbleed", retbleed_parse_cmdline); static void __init retbleed_select_mitigation(void) { - bool mitigate_smt = false; - - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) - return; - - switch (retbleed_cmd) { - case RETBLEED_CMD_OFF: + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; return; + } - case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - } else { + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); - goto do_cmd_auto; } break; - - case RETBLEED_CMD_IBPB: + case RETBLEED_MITIGATION_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); - goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - } else { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } break; + case RETBLEED_MITIGATION_STUFF: + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } + break; + default: + break; + } - case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && - spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO) + return; + /* Intel mitigation selected in retbleed_update_mitigation() */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +} + +static void __init retbleed_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) + goto out; + + /* + * retbleed=stuff is only allowed on Intel. If stuffing can't be used + * then a different mitigation will be selected below. + * + * its=stuff will also attempt to enable stuffing. + */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF || + its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) { + if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) { + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else { - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - else - pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_info("Retbleed mitigation updated to stuffing\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } - break; - -do_cmd_auto: - case RETBLEED_CMD_AUTO: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && - boot_cpu_has(X86_FEATURE_IBPB)) - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option except for call depth based stuffing + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } + /* If nothing has set the mitigation yet, default to NONE. */ + if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO) + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +out: + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} - /* - * The Intel mitigation (IBRS or eIBRS) was already selected in - * spectre_v2_select_mitigation(). 'retbleed_mitigation' will - * be set accordingly below. - */ - break; - } +static void __init retbleed_apply_mitigation(void) +{ + bool mitigate_smt = false; switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_NONE: + return; + case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1142,7 +1338,7 @@ do_cmd_auto: setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -1154,7 +1350,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_return_thunk = call_depth_return_thunk; + set_return_thunk(call_depth_return_thunk); break; default: @@ -1164,28 +1360,131 @@ do_cmd_auto: if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && (retbleed_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); +} + +#undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", + [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", +}; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_mitigation = ITS_MITIGATION_OFF; + } else if (!strcmp(str, "on")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } else if (!strcmp(str, "force")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + setup_force_cpu_bug(X86_BUG_ITS); + } else if (!strcmp(str, "vmexit")) { + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; + } else if (!strcmp(str, "stuff")) { + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_AUTO) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + + if (its_mitigation == ITS_MITIGATION_OFF) + return; + + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || + !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("RSB stuff mitigation not supported, using default\n"); + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } + + if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY && + !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; +} + +static void __init its_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) + return; + + switch (spectre_v2_enabled) { + case SPECTRE_V2_NONE: + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + case SPECTRE_V2_RETPOLINE: + /* Retpoline+CDT mitigates ITS */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + break; + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + default: + break; + } /* - * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option except for call depth based stuffing + * retbleed_update_mitigation() will try to do stuffing if its=stuff. + * If it can't, such as if spectre_v2!=retpoline, then fall back to + * aligned thunks. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - switch (spectre_v2_enabled) { - case SPECTRE_V2_IBRS: - retbleed_mitigation = RETBLEED_MITIGATION_IBRS; - break; - case SPECTRE_V2_EIBRS: - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_EIBRS_LFENCE: - retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; - break; - default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_err(RETBLEED_INTEL_MSG); - } - } + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; - pr_info("%s\n", retbleed_strings[retbleed_mitigation]); + pr_info("%s\n", its_strings[its_mitigation]); +} + +static void __init its_apply_mitigation(void) +{ + /* its=stuff forces retbleed stuffing and is enabled there. */ + if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS) + return; + + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); } #undef pr_fmt @@ -1265,6 +1564,8 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, @@ -1303,31 +1604,18 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } -static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; - -static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(void) +static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { - enum spectre_v2_user_cmd mode; char arg[20]; int ret, i; - mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? - SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; - - switch (spectre_v2_cmd) { - case SPECTRE_V2_CMD_NONE: + if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) return SPECTRE_V2_USER_CMD_NONE; - case SPECTRE_V2_CMD_FORCE: - return SPECTRE_V2_USER_CMD_FORCE; - default: - break; - } ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) - return mode; + return SPECTRE_V2_USER_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { @@ -1338,7 +1626,7 @@ spectre_v2_parse_user_cmdline(void) } pr_err("Unknown user space protection option (%s). Switching to default\n", arg); - return mode; + return SPECTRE_V2_USER_CMD_AUTO; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) @@ -1346,60 +1634,72 @@ static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } -static void __init -spectre_v2_user_select_mitigation(void) +static void __init spectre_v2_user_select_mitigation(void) { - enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - enum spectre_v2_user_cmd cmd; - if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - cmd = spectre_v2_parse_user_cmdline(); - switch (cmd) { + switch (spectre_v2_parse_user_cmdline()) { case SPECTRE_V2_USER_CMD_NONE: - goto set_mode; + return; case SPECTRE_V2_USER_CMD_FORCE: - mode = SPECTRE_V2_USER_STRICT; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: + if (IS_ENABLED(CONFIG_SECCOMP)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP; + else + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = spectre_v2_user_ibpb; + break; case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPECTRE_V2_USER_SECCOMP; + spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP; else - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; } - /* Initialize Indirect Branch Prediction Barrier */ - if (boot_cpu_has(X86_FEATURE_IBPB)) { - static_branch_enable(&switch_vcpu_ibpb); + /* + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; - spectre_v2_user_ibpb = mode; - switch (cmd) { - case SPECTRE_V2_USER_CMD_NONE: - break; - case SPECTRE_V2_USER_CMD_FORCE: - case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: - static_branch_enable(&switch_mm_always_ibpb); - spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; - break; - case SPECTRE_V2_USER_CMD_PRCTL: - case SPECTRE_V2_USER_CMD_AUTO: - case SPECTRE_V2_USER_CMD_SECCOMP: - static_branch_enable(&switch_mm_cond_ibpb); - break; - } + if (!boot_cpu_has(X86_FEATURE_IBPB)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; - pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", - static_key_enabled(&switch_mm_always_ibpb) ? - "always-on" : "conditional"); + if (!boot_cpu_has(X86_FEATURE_STIBP)) + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; +} + +static void __init spectre_v2_user_update_mitigation(void) +{ + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + /* The spectre_v2 cmd line can override spectre_v2_user options */ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; + } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; } /* @@ -1417,30 +1717,44 @@ spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_STIBP) || !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && - !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) { + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; return; + } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. - */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - - if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || - retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (mode != SPECTRE_V2_USER_STRICT && - mode != SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE && + (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) { + if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT && + spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); - mode = SPECTRE_V2_USER_STRICT_PREFERRED; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; } + pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]); +} - spectre_v2_user_stibp = mode; +static void __init spectre_v2_user_apply_mitigation(void) +{ + /* Initialize Indirect Branch Prediction Barrier */ + if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) { + static_branch_enable(&switch_vcpu_ibpb); + + switch (spectre_v2_user_ibpb) { + case SPECTRE_V2_USER_STRICT: + static_branch_enable(&switch_mm_always_ibpb); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } -set_mode: - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } } static const char * const spectre_v2_strings[] = { @@ -1592,51 +1906,54 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) rrsba_disabled = true; } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: + * WARNING! There are many subtleties to consider when changing *any* + * code related to RSB-related mitigations. Before doing so, carefully + * read the following document, and update if necessary: * - * 1) RSB underflow + * Documentation/admin-guide/hw-vuln/rsb.rst * - * 2) Poisoned RSB entry + * In an overly simplified nutshell: * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. + * - User->user RSB attacks are conditionally mitigated during + * context switches by cond_mitigation -> write_ibpb(). * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. + * - User->kernel and guest->host attacks are mitigated by eIBRS or + * RSB filling. * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. + * Though, depending on config, note that other alternative + * mitigations may end up getting used instead, e.g., IBPB on + * entry/vmexit, call depth tracking, or return thunks. */ + switch (mode) { case SPECTRE_V2_NONE: - return; + break; - case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS_RETPOLINE: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } - return; + break; - case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: + pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); - return; - } + break; - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); + default: + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n"); + dump_stack(); + break; + } } /* @@ -1657,12 +1974,13 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, + BHI_MITIGATION_AUTO, BHI_MITIGATION_ON, BHI_MITIGATION_VMEXIT_ONLY, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1684,6 +2002,25 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline); static void __init bhi_select_mitigation(void) { + if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off()) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + bhi_mitigation = BHI_MITIGATION_ON; +} + +static void __init bhi_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) + bhi_mitigation = BHI_MITIGATION_OFF; +} + +static void __init bhi_apply_mitigation(void) +{ if (bhi_mitigation == BHI_MITIGATION_OFF) return; @@ -1695,95 +2032,101 @@ static void __init bhi_select_mitigation(void) return; } - /* Mitigate in hardware if supported */ - if (spec_ctrl_bhi_dis()) + if (!IS_ENABLED(CONFIG_X86_64)) return; - if (!IS_ENABLED(CONFIG_X86_64)) + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) return; if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n"); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); return; } pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n"); setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); } static void __init spectre_v2_select_mitigation(void) { - enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); - enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + spectre_v2_cmd = spectre_v2_parse_cmdline(); - /* - * If the CPU is not affected and the command line mode is NONE or AUTO - * then nothing to do. - */ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) return; - switch (cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return; case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_EIBRS; - break; - } - - if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && - boot_cpu_has_bug(X86_BUG_RETBLEED) && - retbleed_cmd != RETBLEED_CMD_OFF && - retbleed_cmd != RETBLEED_CMD_STUFF && - boot_cpu_has(X86_FEATURE_IBRS) && - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; } - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); - mode = SPECTRE_V2_LFENCE; + spectre_v2_enabled = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - mode = SPECTRE_V2_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: - mode = SPECTRE_V2_EIBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: - mode = SPECTRE_V2_EIBRS_LFENCE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: - mode = SPECTRE_V2_EIBRS_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE; break; } +} + +static void __init spectre_v2_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO && + !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_mitigation != RETBLEED_MITIGATION_NONE && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + spectre_v2_enabled = SPECTRE_V2_IBRS; + } + } - if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off()) + pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]); +} + +static void __init spectre_v2_apply_mitigation(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_ibrs_mode(mode)) { + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { @@ -1792,8 +2135,10 @@ static void __init spectre_v2_select_mitigation(void) } } - switch (mode) { + switch (spectre_v2_enabled) { case SPECTRE_V2_NONE: + return; + case SPECTRE_V2_EIBRS: break; @@ -1819,59 +2164,12 @@ static void __init spectre_v2_select_mitigation(void) * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ - if (mode == SPECTRE_V2_EIBRS_LFENCE || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_RETPOLINE) + if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE || + spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE || + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); - if (boot_cpu_has(X86_BUG_BHI)) - bhi_select_mitigation(); - - spectre_v2_enabled = mode; - pr_info("%s\n", spectre_v2_strings[mode]); - - /* - * If Spectre v2 protection has been enabled, fill the RSB during a - * context switch. In general there are two types of RSB attacks - * across context switches, for which the CALLs/RETs may be unbalanced. - * - * 1) RSB underflow - * - * Some Intel parts have "bottomless RSB". When the RSB is empty, - * speculated return targets may come from the branch predictor, - * which could have a user-poisoned BTB or BHB entry. - * - * AMD has it even worse: *all* returns are speculated from the BTB, - * regardless of the state of the RSB. - * - * When IBRS or eIBRS is enabled, the "user -> kernel" attack - * scenario is mitigated by the IBRS branch prediction isolation - * properties, so the RSB buffer filling wouldn't be necessary to - * protect against this type of attack. - * - * The "user -> user" attack scenario is mitigated by RSB filling. - * - * 2) Poisoned RSB entry - * - * If the 'next' in-kernel return stack is shorter than 'prev', - * 'next' could be tricked into speculating with a user-poisoned RSB - * entry. - * - * The "user -> kernel" attack scenario is mitigated by SMEP and - * eIBRS. - * - * The "user -> user" scenario, also known as SpectreBHB, requires - * RSB clearing. - * - * So to mitigate all cases, unconditionally fill RSB on context - * switches. - * - * FIXME: Is this pointless for retbleed-affected AMD? - */ - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + spectre_v2_select_rsb_mitigation(spectre_v2_enabled); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -1879,28 +2177,26 @@ static void __init spectre_v2_select_mitigation(void) * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. * - * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because - * the user might select retpoline on the kernel command line and if - * the CPU supports Enhanced IBRS, kernel might un-intentionally not - * enable IBRS around firmware calls. + * Use "spectre_v2_enabled" to check Enhanced IBRS instead of + * boot_cpu_has(), because the user might select retpoline on the kernel + * command line and if the CPU supports Enhanced IBRS, kernel might + * un-intentionally not enable IBRS around firmware calls. */ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { - if (retbleed_cmd != RETBLEED_CMD_IBPB) { + if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } - } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + } else if (boot_cpu_has(X86_FEATURE_IBRS) && + !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } - - /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) @@ -2089,19 +2385,18 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) return cmd; } -static enum ssb_mitigation __init __ssb_select_mitigation(void) +static void __init ssb_select_mitigation(void) { - enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; enum ssb_mitigation_cmd cmd; if (!boot_cpu_has(X86_FEATURE_SSBD)) - return mode; + goto out; cmd = ssb_parse_cmdline(); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && (cmd == SPEC_STORE_BYPASS_CMD_NONE || cmd == SPEC_STORE_BYPASS_CMD_AUTO)) - return mode; + return; switch (cmd) { case SPEC_STORE_BYPASS_CMD_SECCOMP: @@ -2110,28 +2405,35 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * enabled. */ if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPEC_STORE_BYPASS_SECCOMP; + ssb_mode = SPEC_STORE_BYPASS_SECCOMP; else - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_ON: - mode = SPEC_STORE_BYPASS_DISABLE; + ssb_mode = SPEC_STORE_BYPASS_DISABLE; break; case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_NONE: break; } +out: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +static void __init ssb_apply_mitigation(void) +{ /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ - if (mode == SPEC_STORE_BYPASS_DISABLE) { + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may @@ -2145,16 +2447,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) update_spec_ctrl(x86_spec_ctrl_base); } } - - return mode; -} - -static void ssb_select_mitigation(void) -{ - ssb_mode = __ssb_select_mitigation(); - - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt @@ -2410,7 +2702,7 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2458,22 +2750,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c) static void __init l1tf_select_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) { + l1tf_mitigation = L1TF_MITIGATION_OFF; + return; + } + + if (l1tf_mitigation == L1TF_MITIGATION_AUTO) { + if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + } +} + +static void __init l1tf_apply_mitigation(void) +{ u64 half_pa; if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; - if (cpu_mitigations_off()) - l1tf_mitigation = L1TF_MITIGATION_OFF; - else if (cpu_mitigations_auto_nosmt()) - l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; - override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_AUTO: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -2533,6 +2836,7 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_AUTO, SRSO_MITIGATION_UCODE_NEEDED, SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, @@ -2542,14 +2846,6 @@ enum srso_mitigation { SRSO_MITIGATION_BP_SPEC_REDUCE, }; -enum srso_mitigation_cmd { - SRSO_CMD_OFF, - SRSO_CMD_MICROCODE, - SRSO_CMD_SAFE_RET, - SRSO_CMD_IBPB, - SRSO_CMD_IBPB_ON_VMEXIT, -}; - static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", @@ -2561,8 +2857,7 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; -static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; -static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; static int __init srso_parse_cmdline(char *str) { @@ -2570,15 +2865,15 @@ static int __init srso_parse_cmdline(char *str) return -EINVAL; if (!strcmp(str, "off")) - srso_cmd = SRSO_CMD_OFF; + srso_mitigation = SRSO_MITIGATION_NONE; else if (!strcmp(str, "microcode")) - srso_cmd = SRSO_CMD_MICROCODE; + srso_mitigation = SRSO_MITIGATION_MICROCODE; else if (!strcmp(str, "safe-ret")) - srso_cmd = SRSO_CMD_SAFE_RET; + srso_mitigation = SRSO_MITIGATION_SAFE_RET; else if (!strcmp(str, "ibpb")) - srso_cmd = SRSO_CMD_IBPB; + srso_mitigation = SRSO_MITIGATION_IBPB; else if (!strcmp(str, "ibpb-vmexit")) - srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2590,132 +2885,85 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); + bool has_microcode; - if (!boot_cpu_has_bug(X86_BUG_SRSO) || - cpu_mitigations_off() || - srso_cmd == SRSO_CMD_OFF) { - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - goto out; - } + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + srso_mitigation = SRSO_MITIGATION_NONE; + + if (srso_mitigation == SRSO_MITIGATION_NONE) + return; + + if (srso_mitigation == SRSO_MITIGATION_AUTO) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. - * - * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - goto out; - } - - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - srso_mitigation = SRSO_MITIGATION_IBPB; - goto out; + srso_mitigation = SRSO_MITIGATION_NONE; + return; } } else { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); - - /* may be overwritten by SRSO_CMD_SAFE_RET below */ - srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } - switch (srso_cmd) { - case SRSO_CMD_MICROCODE: - if (has_microcode) { - srso_mitigation = SRSO_MITIGATION_MICROCODE; - pr_warn(SRSO_NOTICE); - } - break; - - case SRSO_CMD_SAFE_RET: - if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) { + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; goto ibpb_on_vmexit; + } - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - /* - * Enable the return thunk for generated code - * like ftrace, static_call, etc. - */ - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_UNRET); - - if (boot_cpu_data.x86 == 0x19) { - setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; - } else { - setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; - } - if (has_microcode) - srso_mitigation = SRSO_MITIGATION_SAFE_RET; - else - srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; - } else { + if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } - break; - case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB; - - /* - * IBPB on entry already obviates the need for - * software-based untraining so clear those in case some - * other mitigation like Retbleed has selected them. - */ - setup_clear_cpu_cap(X86_FEATURE_UNRET); - setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - - /* - * There is no need for RSB filling: entry_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. - */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { - pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - } + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; break; - ibpb_on_vmexit: - case SRSO_CMD_IBPB_ON_VMEXIT: + case SRSO_MITIGATION_IBPB_ON_VMEXIT: if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; break; } - - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; - - /* - * There is no need for RSB filling: entry_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. - */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { + fallthrough; + case SRSO_MITIGATION_IBPB: + if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } + + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; break; default: break; } +} -out: +static void __init srso_update_mitigation(void) +{ + /* If retbleed is using IBPB, that works for SRSO as well */ + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) + srso_mitigation = SRSO_MITIGATION_IBPB; + + if (boot_cpu_has_bug(X86_BUG_SRSO) && + !cpu_mitigations_off() && + !boot_cpu_has(X86_FEATURE_SRSO_NO)) + pr_info("%s\n", srso_strings[srso_mitigation]); +} + +static void __init srso_apply_mitigation(void) +{ /* * Clear the feature flag if this mitigation is not selected as that * feature flag controls the BpSpecReduce MSR bit toggling in KVM. @@ -2723,8 +2971,52 @@ out: if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); - if (srso_mitigation != SRSO_MITIGATION_NONE) - pr_info("%s\n", srso_strings[srso_mitigation]); + if (srso_mitigation == SRSO_MITIGATION_NONE) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + set_return_thunk(srso_alias_return_thunk); + } else { + setup_force_cpu_cap(X86_FEATURE_SRSO); + set_return_thunk(srso_return_thunk); + } + break; + case SRSO_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + fallthrough; + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; + default: + break; + } } #undef pr_fmt @@ -2819,9 +3111,6 @@ static ssize_t tsx_async_abort_show_state(char *buf) static ssize_t mmio_stale_data_show_state(char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return sysfs_emit(buf, "Unknown: No mitigations\n"); - if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2839,6 +3128,19 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t old_microcode_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return sysfs_emit(buf, "Unknown: running under hypervisor"); + + return sysfs_emit(buf, "Vulnerable\n"); +} + +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2897,7 +3199,7 @@ static const char *spectre_bhi_state(void) !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && rrsba_disabled) return "; BHI: Retpoline"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT)) return "; BHI: Vulnerable, KVM: SW loop"; return "; BHI: Vulnerable"; @@ -3006,7 +3308,6 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: - case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: @@ -3021,6 +3322,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_OLD_MICROCODE: + return old_microcode_show_state(buf); + + case X86_BUG_ITS: + return its_show_state(buf); + default: break; } @@ -3075,10 +3382,7 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); - else - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) @@ -3100,6 +3404,16 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE); +} + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 237faf7e700c..981f8b1f0792 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -10,6 +10,7 @@ #include <asm/cmdline.h> #include <asm/traps.h> #include <asm/cpu.h> +#include <asm/msr.h> enum split_lock_detect_state { sld_off = 0, @@ -95,15 +96,15 @@ static bool split_lock_verify_msr(bool on) { u64 ctrl, tmp; - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl)) return false; if (on) ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; else ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + if (wrmsrq_safe(MSR_TEST_CTRL, ctrl)) return false; - rdmsrl(MSR_TEST_CTRL, tmp); + rdmsrq(MSR_TEST_CTRL, tmp); return ctrl == tmp; } @@ -137,7 +138,7 @@ static void __init __split_lock_setup(void) return; } - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); if (!split_lock_verify_msr(true)) { pr_info("MSR access failed: Disabled\n"); @@ -145,7 +146,7 @@ static void __init __split_lock_setup(void) } /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -162,7 +163,7 @@ static void sld_update_msr(bool on) if (on) test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); + wrmsrq(MSR_TEST_CTRL, test_ctrl_val); } void split_lock_init(void) @@ -297,7 +298,7 @@ void bus_lock_init(void) if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + rdmsrq(MSR_IA32_DEBUGCTLMSR, val); if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && (sld_state == sld_warn || sld_state == sld_fatal)) || @@ -311,7 +312,7 @@ void bus_lock_init(void) val |= DEBUGCTLMSR_BUS_LOCK_DETECT; } - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); + wrmsrq(MSR_IA32_DEBUGCTLMSR, val); } bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -375,7 +376,7 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c) * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set * it have split lock detection. */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps); if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) goto supported; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index b3a520959b51..adfa7e8bb865 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -1,35 +1,28 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Routines to identify caches on Intel CPU. + * x86 CPU caches detection and configuration * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + * Previous changes + * - Venkatesh Pallipadi: Cache identification through CPUID(0x4) + * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure + * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD */ #include <linux/cacheinfo.h> -#include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuhotplug.h> -#include <linux/pci.h> #include <linux/stop_machine.h> -#include <linux/sysfs.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/cpuid/api.h> #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/tlbflush.h> #include "cpu.h" -#define LVL_1_INST 1 -#define LVL_1_DATA 2 -#define LVL_2 3 -#define LVL_3 4 - /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -41,208 +34,127 @@ static cpumask_var_t cpu_cacheinfo_mask; /* Kernel controls MTRR and/or PAT MSRs. */ unsigned int memory_caching_control __ro_after_init; -struct _cache_table { - unsigned char descriptor; - char cache_type; - short size; -}; - -#define MB(x) ((x) * 1024) - -/* All the cache descriptor types we care about (no TLB or - trace cache entries) */ - -static const struct _cache_table cache_table[] = -{ - { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ - { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ - { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ - { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ - { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ - { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ - { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ - { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ - { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ - { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ - { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ - { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ - { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ - { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ - { 0x00, 0, 0} -}; - - enum _cache_type { - CTYPE_NULL = 0, - CTYPE_DATA = 1, - CTYPE_INST = 2, - CTYPE_UNIFIED = 3 + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { struct { - enum _cache_type type:5; - unsigned int level:3; - unsigned int is_self_initializing:1; - unsigned int is_fully_associative:1; - unsigned int reserved:4; - unsigned int num_threads_sharing:12; - unsigned int num_cores_on_die:6; + enum _cache_type type :5; + unsigned int level :3; + unsigned int is_self_initializing :1; + unsigned int is_fully_associative :1; + unsigned int reserved :4; + unsigned int num_threads_sharing :12; + unsigned int num_cores_on_die :6; } split; u32 full; }; union _cpuid4_leaf_ebx { struct { - unsigned int coherency_line_size:12; - unsigned int physical_line_partition:10; - unsigned int ways_of_associativity:10; + unsigned int coherency_line_size :12; + unsigned int physical_line_partition :10; + unsigned int ways_of_associativity :10; } split; u32 full; }; union _cpuid4_leaf_ecx { struct { - unsigned int number_of_sets:32; + unsigned int number_of_sets :32; } split; u32 full; }; -struct _cpuid4_info_regs { +struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned int id; unsigned long size; - struct amd_northbridge *nb; }; -/* AMD doesn't have CPUID4. Emulate it here to report the same - information to the user. This makes some assumptions about the machine: - L2 not shared, no SMT etc. that is currently true on AMD CPUs. +/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */ +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +/* + * Fallback AMD CPUID(0x4) emulation + * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d) + * + * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should + * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006). + */ + +#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff +#define AMD_L2_L3_INVALID_ASSOC 0x9 - In theory the TLBs could be reported as fake type (they are in "dummy"). - Maybe later */ union l1_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:8; - unsigned assoc:8; - unsigned size_in_kb:8; + unsigned line_size :8; + unsigned lines_per_tag :8; + unsigned assoc :8; + unsigned size_in_kb :8; }; - unsigned val; + unsigned int val; }; union l2_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned size_in_kb:16; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned size_in_kb :16; }; - unsigned val; + unsigned int val; }; union l3_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned res:2; - unsigned size_encoded:14; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned res :2; + unsigned size_encoded :14; }; - unsigned val; + unsigned int val; }; +/* L2/L3 associativity mapping */ static const unsigned short assocs[] = { - [1] = 1, - [2] = 2, - [4] = 4, - [6] = 8, - [8] = 16, - [0xa] = 32, - [0xb] = 48, - [0xc] = 64, - [0xd] = 96, - [0xe] = 128, - [0xf] = 0xffff /* fully associative - no way to show this currently */ + [1] = 1, + [2] = 2, + [3] = 3, + [4] = 4, + [5] = 6, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE }; static const unsigned char levels[] = { 1, 1, 2, 3 }; -static const unsigned char types[] = { 1, 2, 3, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static const enum cache_type cache_type_map[] = { - [CTYPE_NULL] = CACHE_TYPE_NOCACHE, - [CTYPE_DATA] = CACHE_TYPE_DATA, - [CTYPE_INST] = CACHE_TYPE_INST, - [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, -}; - -static void -amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) { - unsigned dummy; - unsigned line_size, lines_per_tag, assoc, size_in_kb; - union l1_cache l1i, l1d; + unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d, *l1; union l2_cache l2; union l3_cache l3; - union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; @@ -251,430 +163,155 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); - switch (leaf) { + l1 = &l1d; + switch (index) { case 1: l1 = &l1i; fallthrough; case 0: if (!l1->val) return; - assoc = assocs[l1->assoc]; - line_size = l1->line_size; - lines_per_tag = l1->lines_per_tag; - size_in_kb = l1->size_in_kb; + + assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; break; case 2: - if (!l2.val) + if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l2.assoc]; - line_size = l2.line_size; - lines_per_tag = l2.lines_per_tag; - /* cpu_data has errata corrections for K7 applied */ - size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + + /* Use x86_cache_size as it might have K7 errata fixes */ + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: - if (!l3.val) + if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l3.assoc]; - line_size = l3.line_size; - lines_per_tag = l3.lines_per_tag; - size_in_kb = l3.size_encoded * 512; + + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { - size_in_kb = size_in_kb >> 1; - assoc = assoc >> 1; + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; } break; default: return; } - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = topology_num_cores_per_package(); + eax->split.is_self_initializing = 1; + eax->split.type = types[index]; + eax->split.level = levels[index]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = topology_num_cores_per_package(); - - if (assoc == 0xffff) + if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE) eax->split.is_fully_associative = 1; - ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assoc - 1; - ebx->split.physical_line_partition = lines_per_tag - 1; - ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / - (ebx->split.ways_of_associativity + 1) - 1; -} - -#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) - -/* - * L3 cache descriptors - */ -static void amd_calc_l3_indices(struct amd_northbridge *nb) -{ - struct amd_l3_cache *l3 = &nb->l3_cache; - unsigned int sc0, sc1, sc2, sc3; - u32 val = 0; - - pci_read_config_dword(nb->misc, 0x1C4, &val); - - /* calculate subcache sizes */ - l3->subcaches[0] = sc0 = !(val & BIT(0)); - l3->subcaches[1] = sc1 = !(val & BIT(4)); - - if (boot_cpu_data.x86 == 0x15) { - l3->subcaches[0] = sc0 += !(val & BIT(1)); - l3->subcaches[1] = sc1 += !(val & BIT(5)); - } - - l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); - l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - - l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; -} - -/* - * check whether a slot used for disabling an L3 index is occupied. - * @l3: L3 cache descriptor - * @slot: slot number (0..1) - * - * @returns: the disabled index if used or negative value if slot free. - */ -static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) -{ - unsigned int reg = 0; - - pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); - - /* check whether this slot is activated already */ - if (reg & (3UL << 30)) - return reg & 0xfff; - - return -1; -} - -static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, - unsigned int slot) -{ - int index; - struct amd_northbridge *nb = this_leaf->priv; - - index = amd_get_l3_disable_slot(nb, slot); - if (index >= 0) - return sprintf(buf, "%d\n", index); - - return sprintf(buf, "FREE\n"); -} - -#define SHOW_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return show_cache_disable(this_leaf, buf, slot); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long idx) -{ - int i; - idx |= BIT(30); - - /* - * disable index in all 4 subcaches - */ - for (i = 0; i < 4; i++) { - u32 reg = idx | (i << 20); - - if (!nb->l3_cache.subcaches[i]) - continue; - - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - - /* - * We need to WBINVD on a core on the node containing the L3 - * cache which indices we disable therefore a simple wbinvd() - * is not sufficient. - */ - wbinvd_on_cpu(cpu); - - reg |= BIT(31); - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - } -} - -/* - * disable a L3 cache index by using a disable-slot - * - * @l3: L3 cache descriptor - * @cpu: A CPU on the node containing the L3 cache - * @slot: slot number (0..1) - * @index: index to disable - * - * @return: 0 on success, error status on failure - */ -static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long index) -{ - int ret = 0; - - /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(nb, slot); - if (ret >= 0) - return -EEXIST; - - if (index > nb->l3_cache.indices) - return -EINVAL; - - /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EEXIST; - - amd_l3_disable_index(nb, cpu, slot, index); - - return 0; -} - -static ssize_t store_cache_disable(struct cacheinfo *this_leaf, - const char *buf, size_t count, - unsigned int slot) -{ - unsigned long val = 0; - int cpu, err = 0; - struct amd_northbridge *nb = this_leaf->priv; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - cpu = cpumask_first(&this_leaf->shared_cpu_map); - - if (kstrtoul(buf, 10, &val) < 0) - return -EINVAL; - - err = amd_set_l3_disable_slot(nb, cpu, slot, val); - if (err) { - if (err == -EEXIST) - pr_warn("L3 slot %d in use/index already disabled!\n", - slot); - return err; - } - return count; -} - -#define STORE_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_store(struct device *dev, \ - struct device_attribute *attr, \ - const char *buf, size_t count) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return store_cache_disable(this_leaf, buf, count, slot); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -static ssize_t subcaches_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - - return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); -} - -static ssize_t subcaches_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - unsigned long val; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (kstrtoul(buf, 16, &val) < 0) - return -EINVAL; - - if (amd_set_subcaches(cpu, val)) - return -EINVAL; - - return count; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; } -static DEVICE_ATTR_RW(cache_disable_0); -static DEVICE_ATTR_RW(cache_disable_1); -static DEVICE_ATTR_RW(subcaches); - -static umode_t -cache_private_attrs_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) +static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax, + union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx) { - struct device *dev = kobj_to_dev(kobj); - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - umode_t mode = attr->mode; - - if (!this_leaf->priv) - return 0; - - if ((attr == &dev_attr_subcaches.attr) && - amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return mode; + if (eax.split.type == CTYPE_NULL) + return -EIO; - if ((attr == &dev_attr_cache_disable_0.attr || - attr == &dev_attr_cache_disable_1.attr) && - amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return mode; + id4->eax = eax; + id4->ebx = ebx; + id4->ecx = ecx; + id4->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } -static struct attribute_group cache_private_group = { - .is_visible = cache_private_attrs_is_visible, -}; - -static void init_amd_l3_attrs(void) -{ - int n = 1; - static struct attribute **amd_l3_attrs; - - if (amd_l3_attrs) /* already initialized */ - return; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); - if (!amd_l3_attrs) - return; - - n = 0; - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; - amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; - } - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - amd_l3_attrs[n++] = &dev_attr_subcaches.attr; - - cache_private_group.attrs = amd_l3_attrs; -} - -const struct attribute_group * -cache_get_priv_group(struct cacheinfo *this_leaf) +static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - struct amd_northbridge *nb = this_leaf->priv; - - if (this_leaf->level < 3 || !nb) - return NULL; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - if (nb && nb->l3_cache.indices) - init_amd_l3_attrs(); + if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored); + else + legacy_amd_cpuid4(index, &eax, &ebx, &ecx); - return &cache_private_group; + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - int node; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored); - node = topology_amd_node_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -#else -#define amd_init_l3_cache(x, y) -#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ -static int -cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +static int fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned edx; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - else - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_init_l3_cache(this_leaf, index); - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - amd_init_l3_cache(this_leaf, index); - } else { - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); - } + u8 cpu_vendor = boot_cpu_data.x86_vendor; - if (eax.split.type == CTYPE_NULL) - return -EIO; /* better error ? */ - - this_leaf->eax = eax; - this_leaf->ebx = ebx; - this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); - return 0; + return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ? + amd_fill_cpuid4_info(index, id4) : + intel_fill_cpuid4_info(index, id4); } static int find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx, op; - union _cpuid4_leaf_eax cache_eax; - int i = -1; - - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) - op = 0x8000001d; - else - op = 4; + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + /* Do a CPUID(op) loop to calculate num_cache_leaves */ + op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4; do { ++i; - /* Do cpuid(op) loop to find out num_cache_leaves */ cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CTYPE_NULL); return i; } +/* + * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. + */ + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; if (c->x86 < 0x17) { - /* LLC is at the node level. */ + /* Pre-Zen: LLC is at the node level */ c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Family 17h up to 1F models: LLC is at the core + * complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } else { /* - * LLC ID is calculated from the number of threads sharing the - * cache. - * */ + * Newer families: LLC ID is calculated from the number + * of threads sharing the L3 cache. + */ u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; @@ -683,25 +320,21 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) num_sharing_cache = ((eax >> 14) & 0xfff) + 1; if (num_sharing_cache) { - int bits = get_count_order(num_sharing_cache); + int index_msb = get_count_order(num_sharing_cache); - c->topo.llc_id = c->topo.apicid >> bits; + c->topo.llc_id = c->topo.apicid >> index_msb; } } } void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Hygons are similar to AMD Family 17h up to 1F models: LLC is + * at the core complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } @@ -710,14 +343,10 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) ci->num_leaves = find_num_cache_leaves(c); - } else if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - ci->num_leaves = 4; - else - ci->num_leaves = 3; - } + else if (c->extended_cpuid_level >= 0x80000006) + ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3; } void init_hygon_cacheinfo(struct cpuinfo_x86 *c) @@ -727,148 +356,131 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c) ci->num_leaves = find_num_cache_leaves(c); } -void init_intel_cacheinfo(struct cpuinfo_x86 *c) +static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3, + unsigned int l2, unsigned int l1i, unsigned int l1d) { - /* Cache sizes */ - unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; - unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ - unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + /* + * If llc_id is still unset, then cpuid_level < 4, which implies + * that the only possibility left is SMT. Since CPUID(0x2) doesn't + * specify any shared caches and SMT shares all caches, we can + * unconditionally set LLC ID to the package ID so that all + * threads share it. + */ + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; - if (c->cpuid_level > 3) { - /* - * There should be at least one leaf. A non-zero value means - * that the number of leaves has been initialized. - */ - if (!ci->num_leaves) - ci->num_leaves = find_num_cache_leaves(c); + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d); - /* - * Whenever possible use cpuid(4), deterministic cache - * parameters cpuid leaf to find the cache details - */ - for (i = 0; i < ci->num_leaves; i++) { - struct _cpuid4_info_regs this_leaf = {}; - int retval; + if (!l2) + cpu_detect_cache_sizes(c); +} - retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval < 0) - continue; +/* + * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available. + */ +static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c) +{ + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == CTYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CTYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l3_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - default: - break; - } - } - } + if (c->cpuid_level < 2) + return; - /* Don't use CPUID(2) if CPUID(4) is supported. */ - if (!ci->num_leaves && c->cpuid_level > 1) { - /* supports eax=2 call */ - int j, n; - unsigned int regs[4]; - unsigned char *dp = (unsigned char *)regs; - - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 4 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) { - unsigned char des = dp[j]; - unsigned char k = 0; - - /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) { - if (cache_table[k].descriptor == des) { - switch (cache_table[k].cache_type) { - case LVL_1_INST: - l1i += cache_table[k].size; - break; - case LVL_1_DATA: - l1d += cache_table[k].size; - break; - case LVL_2: - l2 += cache_table[k].size; - break; - case LVL_3: - l3 += cache_table[k].size; - break; - } - - break; - } - - k++; - } - } + cpuid_leaf_0x2(®s); + for_each_cpuid_0x2_desc(regs, ptr, desc) { + switch (desc->c_type) { + case CACHE_L1_INST: l1i += desc->c_size; break; + case CACHE_L1_DATA: l1d += desc->c_size; break; + case CACHE_L2: l2 += desc->c_size; break; + case CACHE_L3: l3 += desc->c_size; break; } } - if (new_l1d) - l1d = new_l1d; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); +} - if (new_l1i) - l1i = new_l1i; +static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4) +{ + unsigned int num_threads_sharing; + int index_msb; - if (new_l2) { - l2 = new_l2; - c->topo.llc_id = l2_id; - c->topo.l2c_id = l2_id; - } + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + return c->topo.apicid & ~((1 << index_msb) - 1); +} - if (new_l3) { - l3 = new_l3; - c->topo.llc_id = l3_id; - } +static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID; + unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0; + + if (c->cpuid_level < 4) + return false; /* - * If llc_id is not yet set, this means cpuid_level < 4 which in - * turns means that the only possibility is SMT (as indicated in - * cpuid1). Since cpuid2 doesn't specify shared caches, and we know - * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->topo.pkg_id. + * There should be at least one leaf. A non-zero value means + * that the number of leaves has been previously initialized. */ - if (c->topo.llc_id == BAD_APICID) - c->topo.llc_id = c->topo.pkg_id; + if (!ci->num_leaves) + ci->num_leaves = find_num_cache_leaves(c); + + if (!ci->num_leaves) + return false; - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); + for (int i = 0; i < ci->num_leaves; i++) { + struct _cpuid4_info id4 = {}; + int ret; - if (!l2) - cpu_detect_cache_sizes(c); + ret = intel_fill_cpuid4_info(i, &id4); + if (ret < 0) + continue; + + switch (id4.eax.split.level) { + case 1: + if (id4.eax.split.type == CTYPE_DATA) + l1d = id4.size / 1024; + else if (id4.eax.split.type == CTYPE_INST) + l1i = id4.size / 1024; + break; + case 2: + l2 = id4.size / 1024; + l2_id = calc_cache_topo_id(c, &id4); + break; + case 3: + l3 = id4.size / 1024; + l3_id = calc_cache_topo_id(c, &id4); + break; + default: + break; + } + } + + c->topo.l2c_id = l2_id; + c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); + return true; +} + +void init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */ + if (intel_cacheinfo_0x4(c)) + return; + + intel_cacheinfo_0x2(c); } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon + */ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci; - struct cacheinfo *this_leaf; + struct cacheinfo *ci; int i, sibling; /* @@ -880,18 +492,18 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, this_cpu_ci = get_cpu_cacheinfo(i); if (!this_cpu_ci->info_list) continue; - this_leaf = this_cpu_ci->info_list + index; + + ci = this_cpu_ci->info_list + index; for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; - nshared = base->eax.split.num_threads_sharing + 1; + nshared = id4->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -905,14 +517,13 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if ((apicid < first) || (apicid > last)) continue; - this_leaf = this_cpu_ci->info_list + index; + ci = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else @@ -921,25 +532,27 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, return 1; } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon + */ static void __cache_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf, *sibling_leaf; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cacheinfo *ci, *sibling_ci; unsigned long num_threads_sharing; int index_msb, i; - struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) { - if (__cache_amd_cpumap_setup(cpu, index, base)) + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + if (__cache_amd_cpumap_setup(cpu, index, id4)) return; } - this_leaf = this_cpu_ci->info_list + index; - num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + ci = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &ci->shared_cpu_map); if (num_threads_sharing == 1) return; @@ -949,30 +562,29 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + /* Skip if itself or no cacheinfo */ if (i == cpu || !sib_cpu_ci->info_list) - continue;/* skip if itself or no cacheinfo */ - sibling_leaf = sib_cpu_ci->info_list + index; - cpumask_set_cpu(i, &this_leaf->shared_cpu_map); - cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + continue; + + sibling_ci = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &ci->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map); } } -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct _cpuid4_info_regs *base) +static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4, + struct amd_northbridge *nb) { - this_leaf->id = base->id; - this_leaf->attributes = CACHE_ID; - this_leaf->level = base->eax.split.level; - this_leaf->type = cache_type_map[base->eax.split.type]; - this_leaf->coherency_line_size = - base->ebx.split.coherency_line_size + 1; - this_leaf->ways_of_associativity = - base->ebx.split.ways_of_associativity + 1; - this_leaf->size = base->size; - this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; - this_leaf->physical_line_partition = - base->ebx.split.physical_line_partition + 1; - this_leaf->priv = base->nb; + ci->id = id4->id; + ci->attributes = CACHE_ID; + ci->level = id4->eax.split.level; + ci->type = cache_type_map[id4->eax.split.type]; + ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1; + ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1; + ci->size = id4->size; + ci->number_of_sets = id4->ecx.split.number_of_sets + 1; + ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1; + ci->priv = nb; } int init_cache_level(unsigned int cpu) @@ -987,38 +599,45 @@ int init_cache_level(unsigned int cpu) } /* - * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input * ECX as cache index. Then right shift apicid by the number's order to get * cache id for this cache node. */ -static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +static void get_cache_id(int cpu, struct _cpuid4_info *id4) { struct cpuinfo_x86 *c = &cpu_data(cpu); unsigned long num_threads_sharing; int index_msb; - num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->topo.apicid >> index_msb; + id4->id = c->topo.apicid >> index_msb; } int populate_cache_leaves(unsigned int cpu) { - unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; - struct _cpuid4_info_regs id4_regs = {}; + struct cacheinfo *ci = this_cpu_ci->info_list; + u8 cpu_vendor = boot_cpu_data.x86_vendor; + struct amd_northbridge *nb = NULL; + struct _cpuid4_info id4 = {}; + int idx, ret; for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { - ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + ret = fill_cpuid4_info(idx, &id4); if (ret) return ret; - get_cache_id(cpu, &id4_regs); - ci_leaf_init(this_leaf++, &id4_regs); - __cache_cpumap_setup(cpu, idx, &id4_regs); + + get_cache_id(cpu, &id4); + + if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) + nb = amd_init_l3_cache(idx); + + ci_info_init(ci++, &id4, nb); + __cache_cpumap_setup(cpu, idx, &id4); } - this_cpu_ci->cpu_map_populated = true; + this_cpu_ci->cpu_map_populated = true; return 0; } @@ -1034,31 +653,33 @@ int populate_cache_leaves(unsigned int cpu) static unsigned long saved_cr4; static DEFINE_RAW_SPINLOCK(cache_disable_lock); +/* + * Cache flushing is the most time-consuming step when programming the + * MTRRs. On many Intel CPUs without known erratas, it can be skipped + * if the CPU declares cache self-snooping support. + */ +static void maybe_flush_caches(void) +{ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + void cache_disable(void) __acquires(cache_disable_lock) { unsigned long cr0; /* - * Note that this is not ideal - * since the cache is only flushed/disabled for this CPU while the - * MTRRs are changed, but changing this requires more invasive - * changes to the way the kernel boots + * This is not ideal since the cache is only flushed/disabled + * for this CPU while the MTRRs are changed, but changing this + * requires more invasive changes to the way the kernel boots. */ - raw_spin_lock(&cache_disable_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - /* - * Cache flushing is the most time-consuming step when programming - * the MTRRs. Fortunately, as per the Intel Software Development - * Manual, we can skip it if the processor supports cache self- - * snooping. - */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_feature_enabled(X86_FEATURE_PGE)) { @@ -1073,9 +694,7 @@ void cache_disable(void) __acquires(cache_disable_lock) if (cpu_feature_enabled(X86_FEATURE_MTRR)) mtrr_disable(); - /* Again, only flush caches if we have to. */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); } void cache_enable(void) __releases(cache_disable_lock) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 12126adbc3a9..8feb8fd2957a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -29,7 +29,7 @@ #include <asm/alternative.h> #include <asm/cmdline.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -148,7 +148,7 @@ static void ppin_init(struct cpuinfo_x86 *c) */ info = (struct ppin_info *)id->driver_data; - if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + if (rdmsrq_safe(info->msr_ppin_ctl, &val)) goto clear_ppin; if ((val & 3UL) == 1UL) { @@ -158,13 +158,13 @@ static void ppin_init(struct cpuinfo_x86 *c) /* If PPIN is disabled, try to enable */ if (!(val & 2UL)) { - wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); - rdmsrl_safe(info->msr_ppin_ctl, &val); + wrmsrq_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrq_safe(info->msr_ppin_ctl, &val); } /* Is the enable bit set? */ if (val & 2UL) { - c->ppin = __rdmsr(info->msr_ppin); + c->ppin = native_rdmsrq(info->msr_ppin); set_cpu_cap(c, info->feature); return; } @@ -242,6 +242,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +SYM_PIC_ALIAS(gdt_page); #ifdef CONFIG_X86_64 static int __init x86_nopcid_setup(char *s) @@ -321,7 +322,7 @@ static int __init cachesize_setup(char *str) __setup("cachesize=", cachesize_setup); /* Probe for the CPUID instruction */ -bool have_cpuid_p(void) +bool cpuid_feature(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -562,9 +563,9 @@ __noendbr u64 ibt_save(bool disable) u64 msr = 0; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); if (disable) - wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); } return msr; @@ -575,10 +576,10 @@ __noendbr void ibt_restore(u64 save) u64 msr; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); msr &= ~CET_ENDBR_EN; msr |= (save & CET_ENDBR_EN); - wrmsrl(MSR_IA32_S_CET, msr); + wrmsrq(MSR_IA32_S_CET, msr); } } @@ -602,15 +603,15 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_USER_SHSTK); if (kernel_ibt) - wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN); else - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); cr4_set_bits(X86_CR4_CET); if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); } } @@ -621,8 +622,8 @@ __noendbr void cet_disable(void) cpu_feature_enabled(X86_FEATURE_SHSTK))) return; - wrmsrl(MSR_IA32_S_CET, 0); - wrmsrl(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_U_CET, 0); } /* @@ -751,9 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * No need to load %gs. It is already correct. * * Writing %gs on 64bit would zero GSBASE which would make any per - * CPU operation up to the point of the wrmsrl() fault. + * CPU operation up to the point of the wrmsrq() fault. * - * Set GSBASE to the new offset. Until the wrmsrl() happens the + * Set GSBASE to the new offset. Until the wrmsrq() happens the * early mapping is still valid. That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). @@ -761,7 +762,7 @@ void __init switch_gdt_and_percpu_base(int cpu) * This works even with stackprotector enabled because the * per CPU stack canary is 0 in both per CPU areas. */ - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); + wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else /* * %fs is already set to __KERNEL_PERCPU, but after switching GDT @@ -1005,17 +1006,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* AMD-defined flags: level 0x80000001 */ + /* + * Check if extended CPUID leaves are implemented: Max extended + * CPUID leaf must be in the 0x80000001-0x8000ffff range. + */ eax = cpuid_eax(0x80000000); - c->extended_cpuid_level = eax; + c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0; - if ((eax & 0xffff0000) == 0x80000000) { - if (eax >= 0x80000001) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (c->extended_cpuid_level >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_8000_0001_ECX] = ecx; - c->x86_capability[CPUID_8000_0001_EDX] = edx; - } + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; } if (c->extended_cpuid_level >= 0x80000007) { @@ -1227,6 +1229,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define GDS BIT(6) /* CPU is affected by Register File Data Sampling */ #define RFDS BIT(7) +/* CPU is affected by Indirect Target Selection */ +#define ITS BIT(8) +/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ +#define ITS_NATIVE_ONLY BIT(9) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1238,22 +1244,25 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), @@ -1288,7 +1297,7 @@ u64 x86_read_arch_cap_msr(void) u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); return x86_arch_cap_msr; } @@ -1318,10 +1327,78 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) return cpu_matches(cpu_vuln_blacklist, RFDS); } +static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_ITS_NO) + return false; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + /* None of the affected CPUs have BHI_CTRL */ + if (boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + /* + * If a VMM did not expose ITS_NO, assume that a guest could + * be running on a vulnerable hardware or may migrate to such + * hardware. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + if (cpu_matches(cpu_vuln_blacklist, ITS)) + return true; + + return false; +} + +static struct x86_cpu_id cpu_latest_microcode[] = { +#include "microcode/intel-ucode-defs.h" + {} +}; + +static bool __init cpu_has_old_microcode(void) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode); + + /* Give unknown CPUs a pass: */ + if (!m) { + /* Intel CPUs should be in the list. Warn if not: */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + pr_info("x86/CPU: Model not found in latest microcode list\n"); + return false; + } + + /* + * Hosts usually lie to guests with a super high microcode + * version. Just ignore what hosts tell guests: + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* Consider all debug microcode to be old: */ + if (boot_cpu_data.microcode & BIT(31)) + return true; + + /* Give new microcode a pass: */ + if (boot_cpu_data.microcode >= m->driver_data) + return false; + + /* Uh oh, too old: */ + return true; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (cpu_has_old_microcode()) { + pr_warn("x86/CPU: Running old microcode\n"); + setup_force_cpu_bug(X86_BUG_OLD_MICROCODE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) @@ -1402,15 +1479,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Affected CPU list is generally enough to enumerate the vulnerability, * but for virtualization case check for ARCH_CAP MSR bits also, VMM may * not want the guest to enumerate the bug. - * - * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, - * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); - else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) - setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { @@ -1439,9 +1511,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); - /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && - !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + /* + * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with + * BHI_NO still need to use the BHI mitigation to prevent Intra-mode + * attacks. When virtualized, eIBRS could be hidden, assume vulnerable. + */ + if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); @@ -1449,6 +1524,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (vulnerable_to_its(x86_arch_cap_msr)) { + setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY)) + setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); + } + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1630,11 +1711,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (have_cpuid_p()) { + if (cpuid_feature()) { cpu_detect(c); get_cpu_vendor(c); intel_unlock_cpuid_leafs(c); @@ -1749,11 +1830,11 @@ static bool detect_null_seg_behavior(void) */ unsigned long old_base, tmp; - rdmsrl(MSR_FS_BASE, old_base); - wrmsrl(MSR_FS_BASE, 1); + rdmsrq(MSR_FS_BASE, old_base); + wrmsrq(MSR_FS_BASE, 1); loadsegment(fs, 0); - rdmsrl(MSR_FS_BASE, tmp); - wrmsrl(MSR_FS_BASE, old_base); + rdmsrq(MSR_FS_BASE, tmp); + wrmsrq(MSR_FS_BASE, old_base); return tmp == 0; } @@ -1794,11 +1875,11 @@ static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) + if (!cpuid_feature()) return; cpu_detect(c); @@ -1982,9 +2063,9 @@ void enable_sep_cpu(void) */ tss->x86_tss.ss1 = __KERNEL_CS; - wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1); + wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32); put_cpu(); } @@ -2091,7 +2172,7 @@ DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_ DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); EXPORT_PER_CPU_SYMBOL(__x86_call_depth); -static void wrmsrl_cstar(unsigned long val) +static void wrmsrq_cstar(unsigned long val) { /* * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR @@ -2099,37 +2180,37 @@ static void wrmsrl_cstar(unsigned long val) * guest. Avoid the pointless write on all Intel CPUs. */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - wrmsrl(MSR_CSTAR, val); + wrmsrq(MSR_CSTAR, val); } static inline void idt_syscall_init(void) { - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + wrmsrq_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); } else { - wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL); } /* * Flags to clear on syscall; clear as much as possible * to minimize user space-kernel interference. */ - wrmsrl(MSR_SYSCALL_MASK, + wrmsrq(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| @@ -2198,7 +2279,7 @@ static inline void setup_getcpu(int cpu) struct desc_struct d = { }; if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) - wrmsr(MSR_TSC_AUX, cpudata, 0); + wrmsrq(MSR_TSC_AUX, cpudata); /* Store CPU and node number in limit. */ d.limit0 = cpudata; @@ -2313,8 +2394,8 @@ void cpu_init(void) memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); + wrmsrq(MSR_FS_BASE, 0); + wrmsrq(MSR_KERNEL_GS_BASE, 0); barrier(); x2apic_setup(); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 51deb60a9d26..bc38b2d56f26 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -75,6 +75,15 @@ extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) +struct amd_northbridge *amd_init_l3_cache(int index); +#else +static inline struct amd_northbridge *amd_init_l3_cache(int index) +{ + return NULL; +} +#endif + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index a2fbea0be535..46efcbd6afa4 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_APX, X86_FEATURE_XSAVE }, { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, { X86_FEATURE_MMX, X86_FEATURE_FXSR }, { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, @@ -82,8 +83,12 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, {} }; diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c new file mode 100644 index 000000000000..89bc8db5e9c6 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sizes.h> + +#include <asm/cpuid/types.h> + +#include "cpu.h" + +#define CACHE_ENTRY(_desc, _type, _size) \ + [_desc] = { \ + .c_type = (_type), \ + .c_size = (_size) / SZ_1K, \ + } + +#define TLB_ENTRY(_desc, _type, _entries) \ + [_desc] = { \ + .t_type = (_type), \ + .entries = (_entries), \ + } + +const struct leaf_0x2_table cpuid_0x2_table[256] = { + CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */ + + TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */ + TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */ + TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */ + TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */ + TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */ + TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */ + TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */ +}; diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 4a4118784c13..d69757246bde 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/processor.h> #include <asm/vmx.h> @@ -118,7 +119,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) bool enable_vmx; u64 msr; - if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); clear_cpu_cap(c, X86_FEATURE_SGX); return; @@ -165,7 +166,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_SGX_LC_ENABLED; } - wrmsrl(MSR_IA32_FEAT_CTL, msr); + wrmsrq(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6af4a4a90a52..2154f12766fb 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -15,6 +15,7 @@ #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> +#include <asm/msr.h> #include "cpu.h" @@ -96,7 +97,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -110,7 +111,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; @@ -194,7 +195,7 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cdc9813871ef..076eaa41b8c8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -6,6 +6,7 @@ #include <linux/minmax.h> #include <linux/smp.h> #include <linux/string.h> +#include <linux/types.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -15,6 +16,7 @@ #include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <asm/cpu.h> +#include <asm/cpuid/api.h> #include <asm/hwcap2.h> #include <asm/intel-family.h> #include <asm/microcode.h> @@ -157,7 +159,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c) u64 tme_activate; int keyid_bits; - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate); if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { pr_info_once("x86/tme: not enabled by BIOS\n"); @@ -299,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * string flag and enhanced fast string capabilities accordingly. */ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { /* X86_FEATURE_ERMS is set based on CPUID */ set_cpu_cap(c, X86_FEATURE_REP_GOOD); @@ -488,7 +490,7 @@ static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; - if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) { if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); } @@ -498,7 +500,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) { u64 msr; - if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr)) return; /* Clear all MISC features */ @@ -509,7 +511,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) probe_xeon_phi_r3mwait(c); msr = this_cpu_read(msr_misc_features_shadow); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msr); } /* @@ -646,103 +648,11 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 - -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 - -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 - -#define TLB_DATA_1G 0x16 -#define TLB_DATA_1G_2M_4M 0x17 - -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 - -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 - -/* - * All of leaf 0x2's one-byte TLB descriptors implies the same number of - * entries for their respective TLB types. The 0x63 descriptor is an - * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries - * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for - * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the - * intel_tlb_table[] mapping. - */ -#define TLB_0x63_2M_4M_ENTRIES 32 - -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; -}; - -static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */ - { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */ - { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */ - { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */ - { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */ - { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */ - { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */ - { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ - { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */ - { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */ - { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */ - { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ - { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */ - { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative - * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ - { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */ - { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ - { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */ - { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ - { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */ - { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ - { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */ - { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */ - { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */ - { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */ - { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */ - { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */ - { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ - { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */ - { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ - { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */ - { 0x00, 0, 0 } -}; - -static void intel_tlb_lookup(const unsigned char desc) +static void intel_tlb_lookup(const struct leaf_0x2_table *desc) { - unsigned int entries; - unsigned char k; - - if (desc == 0) - return; - - /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && - intel_tlb_table[k].descriptor != 0; k++) - ; + short entries = desc->entries; - if (intel_tlb_table[k].tlb_type == 0) - return; - - entries = intel_tlb_table[k].entries; - switch (intel_tlb_table[k].tlb_type) { + switch (desc->t_type) { case STLB_4K: tlb_lli_4k = max(tlb_lli_4k, entries); tlb_lld_4k = max(tlb_lld_4k, entries); @@ -799,28 +709,16 @@ static void intel_tlb_lookup(const unsigned char desc) static void intel_detect_tlb(struct cpuinfo_x86 *c) { - int i, j, n; - unsigned int regs[4]; - unsigned char *desc = (unsigned char *)regs; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; if (c->cpuid_level < 2) return; - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 4 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) - intel_tlb_lookup(desc[j]); - } + cpuid_leaf_0x2(®s); + for_each_cpuid_0x2_desc(regs, ptr, desc) + intel_tlb_lookup(desc); } static const struct cpu_dev intel_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 30b1d63b97f3..bc7671f920a7 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -79,7 +79,7 @@ static int intel_epb_save(void) { u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); /* * Ensure that saved_epb will always be nonzero after this write even if * the EPB value read from the MSR is 0. @@ -94,7 +94,7 @@ static void intel_epb_restore(void) u64 val = this_cpu_read(saved_epb); u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); if (val) { val &= EPB_MASK; } else { @@ -111,7 +111,7 @@ static void intel_epb_restore(void) pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); + wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } static struct syscore_ops intel_epb_syscore_ops = { @@ -135,7 +135,7 @@ static ssize_t energy_perf_bias_show(struct device *dev, u64 epb; int ret; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; @@ -157,11 +157,11 @@ static ssize_t energy_perf_bias_store(struct device *dev, else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) return -EINVAL; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; - ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); if (ret < 0) return ret; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1075a90141da..9d852c3b2cb5 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -662,12 +662,12 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) return; } - rdmsrl(MSR_K7_HWCR, hwcr); + rdmsrq(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + wrmsrq(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ for (i = 0; i < num_msrs; i++) @@ -675,7 +675,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) /* restore old settings */ if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); + wrmsrq(MSR_K7_HWCR, hwcr); } /* cpu init entry point, called from mce.c with preempt off */ @@ -805,12 +805,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); if (m->status & MCI_STATUS_SYNDV) { - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); } } @@ -834,16 +834,16 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { u64 status, addr = 0; - rdmsrl(msr_stat, status); + rdmsrq(msr_stat, status); if (!(status & MCI_STATUS_VAL)) return false; if (status & MCI_STATUS_ADDRV) - rdmsrl(msr_addr, addr); + rdmsrq(msr_addr, addr); __log_error(bank, status, addr, misc); - wrmsrl(msr_stat, 0); + wrmsrq(msr_stat, 0); return status & MCI_STATUS_DEFERRED; } @@ -862,7 +862,7 @@ static bool _log_error_deferred(unsigned int bank, u32 misc) return true; /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); return true; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f6fd71b64b66..e9b3c5d4a52e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -121,7 +121,7 @@ void mce_prep_record_common(struct mce *m) { m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); + m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ m->time = __ktime_get_real_seconds(); } @@ -388,9 +388,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrq(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { int offset; @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -444,7 +444,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) low = (u32)v; high = (u32)(v >> 32); - /* See comment in mce_rdmsrl() */ + /* See comment in mce_rdmsrq() */ asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) @@ -468,7 +468,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs instrumentation_end(); m = &err->m; - m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); if (regs) { /* * Get the address of the instruction at the time of @@ -488,7 +488,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs } /* Use accurate RIP reporting if available. */ if (mca_cfg.rip_msr) - m->ip = mce_rdmsrl(mca_cfg.rip_msr); + m->ip = mce_rdmsrq(mca_cfg.rip_msr); } } @@ -684,10 +684,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) struct mce *m = &err->m; if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -702,12 +702,12 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) } if (mce_flags.smca) { - m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i)); if (m->status & MCI_STATUS_SYNDV) { - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); - err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); - err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i)); } } } @@ -753,7 +753,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m->bank = i; barrier(); - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -829,7 +829,7 @@ clear_it: /* * Clear state for this bank. */ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -887,8 +887,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) */ static noinstr bool quirk_skylake_repmov(void) { - u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE); u64 mc1_status; /* @@ -899,7 +899,7 @@ static noinstr bool quirk_skylake_repmov(void) !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) return false; - mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1)); /* Check for a software-recoverable data fetch error. */ if ((mc1_status & @@ -910,8 +910,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -955,7 +955,7 @@ static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, un int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1274,7 +1274,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1298,7 +1298,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); + mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1306,7 +1306,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + native_wrmsrq(MSR_IA32_MCG_STATUS, 0); return true; } } @@ -1335,7 +1335,7 @@ __mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1693,7 +1693,7 @@ out: instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1822,7 +1822,7 @@ static void __mcheck_cpu_cap_init(void) u64 cap; u8 b; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; @@ -1863,7 +1863,7 @@ static void __mcheck_cpu_init_generic(void) cr4_set_bits(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } @@ -1878,8 +1878,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1905,7 +1905,7 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } @@ -2436,7 +2436,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2786,7 +2786,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 06e3cf7229ce..d02c4f556cd0 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -24,10 +24,11 @@ #include <linux/pci.h> #include <linux/uaccess.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/apic.h> #include <asm/irq_vectors.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -475,27 +476,27 @@ static void prepare_msrs(void *info) struct mce m = *(struct mce *)info; u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); if (m.misc) - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrq(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr); if (m.misc) - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + wrmsrq(MSR_IA32_MCx_MISC(b), m.misc); } } @@ -589,7 +590,7 @@ static int inj_bank_set(void *data, u64 val) u64 cap; /* Get bank count on target CPU so we can handle non-uniform values. */ - rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { @@ -613,7 +614,7 @@ static int inj_bank_set(void *data, u64 val) if (cpu_feature_enabled(X86_FEATURE_SMCA)) { u64 ipid; - if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { pr_err("Error reading IPID on CPU%d\n", m->extcpu); return -EINVAL; } @@ -741,15 +742,15 @@ static void check_hw_inj_possible(void) u64 status = MCI_STATUS_VAL, ipid; /* Check whether bank is populated */ - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); if (!ipid) continue; toggle_hw_mce_inject(cpu, true); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); - rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f863df0ff42c..efcf21e9552e 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -94,7 +94,7 @@ static bool cmci_supported(int *banks) if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return false; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } @@ -106,7 +106,7 @@ static bool lmce_supported(void) if (mca_cfg.lmce_disabled) return false; - rdmsrl(MSR_IA32_MCG_CAP, tmp); + rdmsrq(MSR_IA32_MCG_CAP, tmp); /* * LMCE depends on recovery support in the processor. Hence both @@ -123,7 +123,7 @@ static bool lmce_supported(void) * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally * locks the MSR in the event that it wasn't already locked by BIOS. */ - rdmsrl(MSR_IA32_FEAT_CTL, tmp); + rdmsrq(MSR_IA32_FEAT_CTL, tmp); if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) return false; @@ -141,9 +141,9 @@ static void cmci_set_threshold(int bank, int thresh) u64 val; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -184,7 +184,7 @@ static bool cmci_skip_bank(int bank, u64 *val) if (test_bit(bank, mce_banks_ce_disabled)) return true; - rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), *val); /* Already owned by someone else? */ if (*val & MCI_CTL2_CMCI_EN) { @@ -232,8 +232,8 @@ static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_w struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); /* If the enable bit did not stick, this bank should be polled. */ if (!(val & MCI_CTL2_CMCI_EN)) { @@ -324,9 +324,9 @@ static void __cmci_disable_bank(int bank) if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) return; - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) @@ -430,10 +430,10 @@ void intel_init_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); if (!(val & MCG_EXT_CTL_LMCE_EN)) - wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } void intel_clear_lmce(void) @@ -443,9 +443,9 @@ void intel_clear_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); val &= ~MCG_EXT_CTL_LMCE_EN; - wrmsrl(MSR_IA32_MCG_EXT_CTL, val); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val); } /* @@ -460,10 +460,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c) case INTEL_SANDYBRIDGE_X: case INTEL_IVYBRIDGE_X: case INTEL_HASWELL_X: - if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; - wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + wrmsrq_safe(MSR_ERROR_CONTROL, error_control); break; } } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 95a504ece43e..b5ba598e54cb 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -312,7 +312,7 @@ static __always_inline void pentium_machine_check(struct pt_regs *regs) {} static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif -noinstr u64 mce_rdmsrl(u32 msr); +noinstr u64 mce_rdmsrq(u32 msr); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index b61028cf5c8a..097e39327942 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -199,6 +199,12 @@ static bool need_sha_check(u32 cur_rev) case 0xa70c0: return cur_rev <= 0xa70C009; break; case 0xaa001: return cur_rev <= 0xaa00116; break; case 0xaa002: return cur_rev <= 0xaa00218; break; + case 0xb0021: return cur_rev <= 0xb002146; break; + case 0xb1010: return cur_rev <= 0xb101046; break; + case 0xb2040: return cur_rev <= 0xb204031; break; + case 0xb4040: return cur_rev <= 0xb404031; break; + case 0xb6000: return cur_rev <= 0xb600031; break; + case 0xb7000: return cur_rev <= 0xb700031; break; default: break; } @@ -211,11 +217,9 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi { struct patch_digest *pd = NULL; u8 digest[SHA256_DIGEST_SIZE]; - struct sha256_state s; int i; - if (x86_family(bsp_cpuid_1_eax) < 0x17 || - x86_family(bsp_cpuid_1_eax) > 0x19) + if (x86_family(bsp_cpuid_1_eax) < 0x17) return true; if (!need_sha_check(cur_rev)) @@ -230,9 +234,7 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return false; } - sha256_init(&s); - sha256_update(&s, data, len); - sha256_final(&s, digest); + sha256(data, len, digest); if (memcmp(digest, pd->sha256, sizeof(digest))) { pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id); @@ -602,7 +604,7 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) return false; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr); + native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr); if (x86_family(bsp_cpuid_1_eax) == 0x17) { unsigned long p_addr_end = p_addr + psize - 1; @@ -1093,15 +1095,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz static int __init save_microcode_in_initrd(void) { - unsigned int cpuid_1_eax = native_cpuid_eax(1); struct cpuinfo_x86 *c = &boot_cpu_data; struct cont_desc desc = { 0 }; + unsigned int cpuid_1_eax; enum ucode_state ret; struct cpio_data cp; - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) return 0; + cpuid_1_eax = native_cpuid_eax(1); + if (!find_blobs_in_containers(&cp)) return -EINVAL; @@ -1171,11 +1175,18 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +static void finalize_late_load_amd(int result) +{ + if (result) + cleanup(); +} + static struct microcode_ops microcode_amd_ops = { .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, + .finalize_late_load = finalize_late_load_amd, .nmi_safe = true, }; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b3658d11e7b6..fe50eb5b7c4a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -37,12 +37,13 @@ #include <asm/perf_event.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/msr.h> #include <asm/setup.h> #include "internal.h" -static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr = true; +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr = false; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); module_param(force_minrev, bool, S_IRUSR | S_IWUSR); @@ -84,6 +85,9 @@ static bool amd_check_current_patch_level(void) u32 lvl, dummy, i; u32 *levels; + if (x86_cpuid_vendor() != X86_VENDOR_AMD) + return false; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); levels = final_levels; @@ -95,27 +99,29 @@ static bool amd_check_current_patch_level(void) return false; } -static bool __init check_loader_disabled_bsp(void) +bool __init microcode_loader_disabled(void) { - static const char *__dis_opt_str = "dis_ucode_ldr"; - const char *cmdline = boot_command_line; - const char *option = __dis_opt_str; + if (dis_ucode_ldr) + return true; /* - * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not - * completely accurate as xen pv guests don't see that CPUID bit set but - * that's good enough as they don't land on the BSP path anyway. + * Disable when: + * + * 1) The CPU does not support CPUID. + * + * 2) Bit 31 in CPUID[1]:ECX is clear + * The bit is reserved for hypervisor use. This is still not + * completely accurate as XEN PV guests don't see that CPUID bit + * set, but that's good enough as they don't land on the BSP + * path anyway. + * + * 3) Certain AMD patch levels are not allowed to be + * overwritten. */ - if (native_cpuid_ecx(1) & BIT(31)) - return true; - - if (x86_cpuid_vendor() == X86_VENDOR_AMD) { - if (amd_check_current_patch_level()) - return true; - } - - if (cmdline_find_option_bool(cmdline, option) <= 0) - dis_ucode_ldr = false; + if (!cpuid_feature() || + native_cpuid_ecx(1) & BIT(31) || + amd_check_current_patch_level()) + dis_ucode_ldr = true; return dis_ucode_ldr; } @@ -125,7 +131,10 @@ void __init load_ucode_bsp(void) unsigned int cpuid_1_eax; bool intel = true; - if (!have_cpuid_p()) + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; + + if (microcode_loader_disabled()) return; cpuid_1_eax = native_cpuid_eax(1); @@ -146,9 +155,6 @@ void __init load_ucode_bsp(void) return; } - if (check_loader_disabled_bsp()) - return; - if (intel) load_ucode_intel_bsp(&early_data); else @@ -159,6 +165,11 @@ void load_ucode_ap(void) { unsigned int cpuid_1_eax; + /* + * Can't use microcode_loader_disabled() here - .init section + * hell. It doesn't have to either - the BSP variant must've + * parsed cmdline already anyway. + */ if (dis_ucode_ldr) return; @@ -686,6 +697,8 @@ static int load_late_locked(void) return load_late_stop_cpus(true); case UCODE_NFOUND: return -ENOENT; + case UCODE_OK: + return 0; default: return -EBADFD; } @@ -810,7 +823,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (dis_ucode_ldr) + if (microcode_loader_disabled()) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h new file mode 100644 index 000000000000..cb6e601701ab --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -0,0 +1,150 @@ +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 }, diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 819199bc0119..371ca6eac00e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -320,7 +320,7 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, } /* write microcode via MSR 0x79 */ - native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); rev = intel_get_microcode_revision(); if (rev != mc->hdr.rev) @@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void) if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED) return 0; - if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; uci.mc = get_microcode_blob(&uci, true); diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 5df621752fef..50a9702ae4e2 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void) return x86_family(eax); } -extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 3e2533954675..c78f860419d6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -30,6 +30,7 @@ #include <asm/reboot.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> +#include <asm/msr.h> #include <asm/numa.h> #include <asm/svm.h> @@ -70,7 +71,7 @@ u64 hv_get_non_nested_msr(unsigned int reg) if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else - rdmsrl(reg, value); + rdmsrq(reg, value); return value; } EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); @@ -82,9 +83,9 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value) /* Write proxy bit via wrmsl instruction */ if (hv_is_sint_msr(reg)) - wrmsrl(reg, value | 1 << 20); + wrmsrq(reg, value | 1 << 20); } else { - wrmsrl(reg, value); + wrmsrq(reg, value); } } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); @@ -345,7 +346,7 @@ static unsigned long hv_get_tsc_khz(void) { unsigned long freq; - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); return freq / 1000; } @@ -541,7 +542,7 @@ static void __init ms_hyperv_init_platform(void) */ u64 hv_lapic_frequency; - rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_period = hv_lapic_frequency; pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", @@ -574,7 +575,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); + wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index cf29681d01e0..d987b11c168c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,6 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include <asm/resctrl.h> #include "internal.h" @@ -145,10 +146,10 @@ static inline void cache_alloc_hsw_probe(void) struct rdt_resource *r = &hw_res->r_resctrl; u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) + if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); + rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ if (l3_cbm_0 != max_cbm) @@ -309,7 +310,7 @@ static void mba_wrmsr_amd(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -334,7 +335,7 @@ static void mba_wrmsr_intel(struct msr_param *m) /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); + wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } static void cat_wrmsr(struct msr_param *m) @@ -344,7 +345,7 @@ static void cat_wrmsr(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } u32 resctrl_arch_get_num_closid(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a93ed7d2a160..591b0b44d260 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include <asm/resctrl.h> #include "internal.h" @@ -238,7 +239,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) * are error bits. */ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); - rdmsrl(MSR_IA32_QM_CTR, msr_val); + rdmsrq(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) return -EIO; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 92ea1472bde9..1190c48a16b2 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -25,6 +25,7 @@ #include <asm/cpu_device_id.h> #include <asm/resctrl.h> #include <asm/perf_event.h> +#include <asm/msr.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "internal.h" @@ -481,8 +482,8 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ - saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL); + native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); mem_r = plr->kmem; @@ -494,7 +495,7 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); /* * Cache was flushed earlier. Now access kernel memory to read it @@ -531,10 +532,10 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); + wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -904,7 +905,7 @@ int resctrl_arch_measure_cycles_lat_fn(void *_plr) * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed @@ -1000,7 +1001,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); /* Initialize rest of local variables */ /* @@ -1018,8 +1019,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * used in L1 cache, second to capture accurate value that does not * include cache misses incurred because of instruction loads. */ - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * From SDM: Performing back-to-back fast reads are not guaranteed * to be monotonic. @@ -1027,8 +1028,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1050,8 +1051,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_after); - rdpmcl(miss_pmcnum, miss_after); + hits_after = rdpmc(hit_pmcnum); + miss_after = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 93ec829015f1..c85ace29ea3a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -28,6 +28,7 @@ #include <uapi/linux/magic.h> +#include <asm/msr.h> #include <asm/resctrl.h> #include "internal.h" @@ -1635,7 +1636,7 @@ void resctrl_arch_mon_event_config_read(void *_config_info) pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval); /* Report only the valid event configuration bits */ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; @@ -1707,7 +1708,7 @@ void resctrl_arch_mon_event_config_write(void *_config_info) pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0); + wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); } static void mbm_config_write_domain(struct rdt_resource *r, @@ -2326,14 +2327,14 @@ static void l3_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); + wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); } static void l2_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); + wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); } static inline bool is_mba_linear(void) @@ -3553,6 +3554,22 @@ static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) free_rmid(rgrp->closid, rgrp->mon.rmid); } +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(rdt_kn_name(kn), "mon_groups") && + strcmp(name, "mon_groups")); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3568,6 +3585,15 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_unlock; } + /* + * Check that the parent directory for a monitor group is a "mon_groups" + * directory. + */ + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { + ret = -EPERM; + goto out_unlock; + } + if (rtype == RDTMON_GROUP && (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { @@ -3751,22 +3777,6 @@ out_unlock: return ret; } -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(rdt_kn_name(kn), "mon_groups") && - strcmp(name, "mon_groups")); -} - static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { @@ -3782,11 +3792,8 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - /* - * If RDT monitoring is supported and the parent directory is a valid - * "mon_groups" directory, add a monitoring subdirectory. - */ - if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) + /* Else, attempt to add a monitoring subdirectory. */ + if (resctrl_arch_mon_capable()) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 16f3ca30626a..dbf6d71bdf18 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, @@ -53,7 +54,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, - { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, + { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8ce352fc72ac..6722b2fc82cf 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/vmalloc.h> +#include <asm/msr.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -871,7 +872,7 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash) WARN_ON_ONCE(preemptible()); for (i = 0; i < 4; i++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); + wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); } const struct file_operations sgx_provision_fops = { diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 01456236a6dd..e35ccdc84910 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,6 +30,7 @@ #include <asm/hypervisor.h> #include <asm/io_apic.h> #include <asm/mpspec.h> +#include <asm/msr.h> #include <asm/smp.h> #include "cpu.h" @@ -154,7 +155,7 @@ static __init bool check_for_real_bsp(u32 apic_id) * kernel must rely on the firmware enumeration order. */ if (has_apic_base) { - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); } diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 03b3c9c3a45e..843b1655ab45 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -3,6 +3,7 @@ #include <asm/apic.h> #include <asm/memtype.h> +#include <asm/msr.h> #include <asm/processor.h> #include "cpu.h" @@ -133,7 +134,7 @@ static void parse_fam10h_node_id(struct topo_scan *tscan) if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) return; - rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + rdmsrq(MSR_FAM10H_NODE_ID, nid.msr); store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); tscan->c->topo.llc_id = nid.node_id; } @@ -160,7 +161,7 @@ static void topoext_fixup(struct topo_scan *tscan) if (msr_set_bit(0xc0011005, 54) <= 0) return; - rdmsrl(0xc0011005, msrval); + rdmsrq(0xc0011005, msrval); if (msrval & BIT_64(54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); @@ -182,7 +183,7 @@ static void parse_topology_amd(struct topo_scan *tscan) if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) has_topoext = cpu_parse_topology_ext(tscan); - if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); if (!has_topoext && !parse_8000_0008(tscan)) diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index b31ee4f1657a..49782724a943 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -12,6 +12,7 @@ #include <asm/cmdline.h> #include <asm/cpu.h> +#include <asm/msr.h> #include "cpu.h" @@ -24,7 +25,7 @@ static void tsx_disable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Force all transactions to immediately abort */ tsx |= TSX_CTRL_RTM_DISABLE; @@ -37,14 +38,14 @@ static void tsx_disable(void) */ tsx |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static void tsx_enable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Enable the RTM feature in the cpu */ tsx &= ~TSX_CTRL_RTM_DISABLE; @@ -56,7 +57,7 @@ static void tsx_enable(void) */ tsx &= ~TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) @@ -115,13 +116,13 @@ static void tsx_clear_cpuid(void) */ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { - rdmsrl(MSR_TSX_FORCE_ABORT, msr); + rdmsrq(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; - wrmsrl(MSR_TSX_FORCE_ABORT, msr); + wrmsrq(MSR_TSX_FORCE_ABORT, msr); } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { - rdmsrl(MSR_IA32_TSX_CTRL, msr); + rdmsrq(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, msr); + wrmsrq(MSR_IA32_TSX_CTRL, msr); } } @@ -146,11 +147,11 @@ static void tsx_dev_mode_disable(void) !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); if (mcu_opt_ctrl & RTM_ALLOW) { mcu_opt_ctrl &= ~RTM_ALLOW; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); } } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 2293efd6ffa6..933fcd7ff250 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -33,7 +33,7 @@ static DEFINE_MUTEX(umwait_lock); static void umwait_update_control_msr(void * unused) { lockdep_assert_irqs_disabled(); - wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached)); } /* @@ -71,7 +71,7 @@ static int umwait_cpu_offline(unsigned int cpu) * the original control MSR value in umwait_init(). So there * is no race condition here. */ - wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); return 0; } @@ -214,7 +214,7 @@ static int __init umwait_init(void) * changed. This is the only place where orig_umwait_control_cached * is modified. */ - rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", umwait_cpu_online, umwait_cpu_offline); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 90eba7eb5335..89b1c8a70fe8 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> +#include <asm/msr.h> #include "cpu.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c6fefd4585f8..71ee20102a8a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -23,8 +23,6 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -int panic_on_unrecovered_nmi; -int panic_on_io_nmi; static int die_counter; static struct pt_regs exec_summary_regs; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 57120f0749cc..9920122018a0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -753,22 +753,21 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; - unsigned long pfn = 0; + u64 last_addr = 0; for (i = 0; i < e820_table->nr_entries; i++) { struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(entry->addr)) - register_nosave_region(pfn, PFN_UP(entry->addr)); - - pfn = PFN_DOWN(entry->addr + entry->size); - if (entry->type != E820_TYPE_RAM) - register_nosave_region(PFN_UP(entry->addr), pfn); + continue; - if (pfn >= limit_pfn) - break; + if (last_addr < entry->addr) + register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr)); + + last_addr = entry->addr + entry->size; } + + register_nosave_region(PFN_DOWN(last_addr), limit_pfn); } #ifdef CONFIG_ACPI @@ -1300,6 +1299,14 @@ void __init e820__memblock_setup(void) memblock_add(entry->addr, entry->size); } + /* + * 32-bit systems are limited to 4BG of memory even with HIGHMEM and + * to even less without it. + * Discard memory after max_pfn - the actual limit detected at runtime. + */ + if (IS_ENABLED(CONFIG_X86_32)) + memblock_remove(PFN_PHYS(max_pfn), -1); + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 611f27e3890c..cba75306e5b6 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/console.h> #include <linux/kernel.h> +#include <linux/kexec.h> #include <linux/init.h> #include <linux/string.h> #include <linux/screen_info.h> @@ -144,6 +145,11 @@ static __init void early_serial_hw_init(unsigned divisor) static_call(serial_out)(early_serial_base, DLL, divisor & 0xff); static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff); static_call(serial_out)(early_serial_base, LCR, c & ~DLAB); + +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + if (static_call_query(serial_in) == io_serial_in) + kexec_debug_8250_port = early_serial_base; +#endif } #define DEFAULT_BAUD 9600 @@ -327,6 +333,9 @@ static __init void early_pci_serial_init(char *s) /* WARNING! assuming the address is always in the first 4G */ early_serial_base = (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK; +#endif write_pci_config(bus, slot, func, PCI_COMMAND, cmdreg|PCI_COMMAND_MEMORY); } @@ -389,10 +398,10 @@ static int __init setup_early_printk(char *buf) keep = (strstr(buf, "keep") != NULL); while (*buf != '\0') { - if (!strncmp(buf, "mmio", 4)) { - early_mmio_serial_init(buf + 4); + if (!strncmp(buf, "mmio32", 6)) { + buf += 6; + early_mmio_serial_init(buf); early_console_register(&early_serial_console, keep); - buf += 4; } if (!strncmp(buf, "serial", 6)) { buf += 6; @@ -407,9 +416,9 @@ static int __init setup_early_printk(char *buf) } #ifdef CONFIG_PCI if (!strncmp(buf, "pciserial", 9)) { - early_pci_serial_init(buf + 9); + buf += 9; /* Keep from match the above "pciserial" */ + early_pci_serial_init(buf); early_console_register(&early_serial_console, keep); - buf += 9; /* Keep from match the above "serial" */ } #endif if (!strncmp(buf, "vga", 3) && diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index f6d856bd50bc..10d0a720659c 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -53,7 +53,7 @@ static inline void fpregs_activate(struct fpu *fpu) /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) @@ -67,7 +67,7 @@ static inline void fpregs_restore_userregs(void) * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. + * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 91d6341f281f..ea138583dd92 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -11,6 +11,7 @@ #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> +#include <asm/msr.h> #include <asm/traps.h> #include <asm/irq_regs.h> @@ -43,14 +44,27 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* Track in-kernel FPU usage */ -static DEFINE_PER_CPU(bool, in_kernel_fpu); +/* + * Track FPU initialization and kernel-mode usage. 'true' means the FPU is + * initialized and is not currently being used by the kernel: + */ +DEFINE_PER_CPU(bool, kernel_fpu_allowed); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); +#ifdef CONFIG_X86_DEBUG_FPU +struct fpu *x86_task_fpu(struct task_struct *task) +{ + if (WARN_ON_ONCE(task->flags & PF_KTHREAD)) + return NULL; + + return (void *)task + sizeof(*task); +} +#endif + /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? @@ -61,15 +75,18 @@ bool irq_fpu_usable(void) return false; /* - * In kernel FPU usage already active? This detects any explicitly - * nested usage in task or softirq context, which is unsupported. It - * also detects attempted usage in a hardirq that has interrupted a - * kernel-mode FPU section. + * Return false in the following cases: + * + * - FPU is not yet initialized. This can happen only when the call is + * coming from CPU onlining, for example for microcode checksumming. + * - The kernel is already using the FPU, either because of explicit + * nesting (which should never be done), or because of implicit + * nesting when a hardirq interrupted a kernel-mode FPU section. + * + * The single boolean check below handles both cases: */ - if (this_cpu_read(in_kernel_fpu)) { - WARN_ON_FPU(!in_hardirq()); + if (!this_cpu_read(kernel_fpu_allowed)) return false; - } /* * When not in NMI or hard interrupt context, FPU can be used in: @@ -202,7 +219,7 @@ void fpu_reset_from_exception_fixup(void) #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); -static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +static void fpu_lock_guest_permissions(void) { struct fpu_state_perm *fpuperm; u64 perm; @@ -211,15 +228,13 @@ static void fpu_init_guest_permissions(struct fpu_guest *gfpu) return; spin_lock_irq(¤t->sighand->siglock); - fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + fpuperm = &x86_task_fpu(current->group_leader)->guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(¤t->sighand->siglock); - - gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) @@ -240,7 +255,6 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_kernel_cfg.default_features; - gfpu->perm = fpu_kernel_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state @@ -255,7 +269,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; - fpu_init_guest_permissions(gfpu); + fpu_lock_guest_permissions(); return true; } @@ -263,16 +277,16 @@ EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { - struct fpstate *fps = gfpu->fpstate; + struct fpstate *fpstate = gfpu->fpstate; - if (!fps) + if (!fpstate) return; - if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use)) return; gfpu->fpstate = NULL; - vfree(fps); + vfree(fpstate); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); @@ -323,12 +337,12 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); */ void fpu_sync_guest_vmexit_xfd_state(void) { - struct fpstate *fps = current->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { - rdmsrl(MSR_IA32_XFD, fps->xfd); - __this_cpu_write(xfd_state, fps->xfd); + rdmsrq(MSR_IA32_XFD, fpstate->xfd); + __this_cpu_write(xfd_state, fpstate->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); @@ -337,7 +351,7 @@ EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); @@ -431,14 +445,15 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) fpregs_lock(); WARN_ON_FPU(!irq_fpu_usable()); - WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, true); + /* Toggle kernel_fpu_allowed to false: */ + WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed)); + this_cpu_write(kernel_fpu_allowed, false); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); - save_fpregs_to_fpstate(¤t->thread.fpu); + save_fpregs_to_fpstate(x86_task_fpu(current)); } __cpu_invalidate_fpregs_state(); @@ -453,9 +468,10 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { - WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + /* Toggle kernel_fpu_allowed back to true: */ + WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed)); + this_cpu_write(kernel_fpu_allowed, true); - this_cpu_write(in_kernel_fpu, false); if (!irqs_disabled()) fpregs_unlock(); } @@ -467,7 +483,7 @@ EXPORT_SYMBOL_GPL(kernel_fpu_end); */ void fpu_sync_fpstate(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); trace_x86_fpu_before_save(fpu); @@ -552,7 +568,7 @@ void fpstate_reset(struct fpu *fpu) static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { - struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + struct fpu *src_fpu = x86_task_fpu(current->group_leader); spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ @@ -572,7 +588,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) if (!ssp) return 0; - xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave, XFEATURE_CET_USER); /* @@ -593,8 +609,16 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { - struct fpu *src_fpu = ¤t->thread.fpu; - struct fpu *dst_fpu = &dst->thread.fpu; + /* + * We allocate the new FPU structure right after the end of the task struct. + * task allocation size already took this into account. + * + * This is safe because task_struct size is a multiple of cacheline size, + * thus x86_task_fpu() will always be cacheline aligned as well. + */ + struct fpu *dst_fpu = (void *)dst + sizeof(*dst); + + BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; @@ -657,19 +681,22 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, if (update_fpu_shstk(dst, ssp)) return 1; - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* - * Whitelist the FPU register state embedded into task_struct for hardened - * usercopy. + * While struct fpu is no longer part of struct thread_struct, it is still + * allocated after struct task_struct in the "task_struct" kmem cache. But + * since FPU is expected to be part of struct thread_struct, we have to + * adjust for it here. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + /* The allocation follows struct task_struct. */ + *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread); + *offset += offsetof(struct fpu, __fpstate.regs); *size = fpu_kernel_cfg.default_size; } @@ -682,11 +709,18 @@ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) * a state-restore is coming: either an explicit one, * or a reschedule. */ -void fpu__drop(struct fpu *fpu) +void fpu__drop(struct task_struct *tsk) { + struct fpu *fpu; + + if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD)) + return; + + fpu = x86_task_fpu(tsk); + preempt_disable(); - if (fpu == ¤t->thread.fpu) { + if (fpu == x86_task_fpu(current)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -718,9 +752,9 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpregs(void) +static void fpu_reset_fpstate_regs(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); @@ -749,11 +783,11 @@ static void fpu_reset_fpregs(void) */ void fpu__clear_user_states(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpregs(); + fpu_reset_fpstate_regs(); fpregs_unlock(); return; } @@ -782,8 +816,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpstate_reset(¤t->thread.fpu); - fpu_reset_fpregs(); + fpstate_reset(x86_task_fpu(current)); + fpu_reset_fpstate_regs(); } /* * Load FPU context before returning to userspace. @@ -823,7 +857,7 @@ void fpregs_lock_and_load(void) */ void fpregs_assert_state_consistent(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; @@ -835,7 +869,7 @@ EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); void fpregs_mark_activate(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 998a08f17e33..99db41bf9fa6 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); + ; else #endif asm volatile ("fninit"); @@ -51,6 +51,9 @@ void fpu__init_cpu(void) { fpu__init_cpu_generic(); fpu__init_cpu_xstate(); + + /* Start allowing kernel-mode FPU: */ + this_cpu_write(kernel_fpu_allowed, true); } static bool __init fpu__probe_without_cpuid(void) @@ -73,6 +76,8 @@ static bool __init fpu__probe_without_cpuid(void) static void __init fpu__init_system_early_generic(void) { + set_thread_flag(TIF_NEED_FPU_LOAD); + if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { if (fpu__probe_without_cpuid()) @@ -94,7 +99,6 @@ static void __init fpu__init_system_early_generic(void) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; -EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { @@ -150,11 +154,13 @@ static void __init fpu__init_task_struct_size(void) { int task_size = sizeof(struct task_struct); + task_size += sizeof(struct fpu); + /* * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(current->thread.fpu.__fpstate.regs); + task_size -= sizeof(union fpregs_state); /* * Add back the dynamically-calculated register state @@ -164,14 +170,9 @@ static void __init fpu__init_task_struct_size(void) /* * We dynamically size 'struct fpu', so we require that - * it be at the end of 'thread_struct' and that - * 'thread_struct' be at the end of 'task_struct'. If - * you hit a compile error here, check the structure to - * see if something got added to the end. + * 'state' be at the end of 'it: */ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); - CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); - CHECK_MEMBER_AT_END_OF(struct task_struct, thread); arch_task_struct_size = task_size; } @@ -204,7 +205,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_kernel_cfg.default_size = size; fpu_user_cfg.max_size = size; fpu_user_cfg.default_size = size; - fpstate_reset(¤t->thread.fpu); } /* @@ -213,7 +213,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ void __init fpu__init_system(void) { - fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(); /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 887b0b8e21e3..0986c2200adc 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -45,7 +45,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r */ static void sync_fpstate(struct fpu *fpu) { - if (fpu == ¤t->thread.fpu) + if (fpu == x86_task_fpu(current)) fpu_sync_fpstate(fpu); } @@ -63,7 +63,7 @@ static void fpu_force_restore(struct fpu *fpu) * Only stopped child tasks can be used to modify the FPU * state in the fpstate buffer: */ - WARN_ON_FPU(fpu == ¤t->thread.fpu); + WARN_ON_FPU(fpu == x86_task_fpu(current)); __fpu_invalidate_fpregs_state(fpu); } @@ -71,7 +71,7 @@ static void fpu_force_restore(struct fpu *fpu) int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; @@ -91,7 +91,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct fxregs_state newstate; int ret; @@ -133,7 +133,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - sync_fpstate(&target->thread.fpu); + sync_fpstate(x86_task_fpu(target)); copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); return 0; @@ -143,7 +143,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *tmpbuf = NULL; int ret; @@ -187,7 +187,7 @@ int ssp_active(struct task_struct *target, const struct user_regset *regset) int ssp_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct cet_user_state *cetregs; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || @@ -214,7 +214,7 @@ int ssp_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *xsave = &fpu->fpstate->regs.xsave; struct cet_user_state *cetregs; unsigned long user_ssp; @@ -368,7 +368,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); + __convert_from_fxsr(env, tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -401,7 +401,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave, int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; struct fxregs_state fxsave, *fx; @@ -433,7 +433,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; int ret; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6c69cb28b298..c3ec2512f2bb 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -43,13 +43,13 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + current->thread.fpu.fpstate->user_size))) + if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) return true; setfx: - trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + trace_x86_fpu_xstate_check_failed(x86_task_fpu(current)); /* Set the parameters for fx only state */ fx_sw->magic1 = 0; @@ -64,13 +64,13 @@ setfx: static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; + struct xregs_state *xsave = &x86_task_fpu(tsk)->fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); + fxsave(&x86_task_fpu(tsk)->fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -114,7 +114,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, { struct xregs_state __user *x = buf; struct _fpx_sw_bytes sw_bytes = {}; - u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ @@ -128,12 +127,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, (__u32 __user *)(buf + fpstate->user_size)); /* - * Read the xfeatures which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - - /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. This will * enable us capturing any changes(during sigreturn) to @@ -144,9 +137,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ - xfeatures |= XFEATURE_MASK_FPSSE; - - err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE); return !err; } @@ -184,7 +175,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pk bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; - struct fpstate *fpstate = tsk->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(tsk)->fpstate; bool ia32_fxstate = (buf != buf_fx); int ret; @@ -272,7 +263,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, */ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int ret; /* Restore enabled features only. */ @@ -332,7 +323,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; + struct fpu *fpu = x86_task_fpu(tsk); struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; @@ -452,7 +443,7 @@ static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; @@ -499,7 +490,7 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); + unsigned long frame_size = xstate_sigframe_size(x86_task_fpu(current)->fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6a41d1610d8b..9aa9ac8399ae 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -14,13 +14,15 @@ #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/coredump.h> +#include <linux/sort.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> +#include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> @@ -62,6 +64,7 @@ static const char *xfeature_names[] = "unknown xstate feature", "AMX Tile config", "AMX Tile data", + "APX registers", "unknown xstate feature", }; @@ -80,6 +83,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, + [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -88,6 +92,31 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; +/* + * Ordering of xstate components in uncompacted format: The xfeature + * number does not necessarily indicate its position in the XSAVE buffer. + * This array defines the traversal order of xstate features. + */ +static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; + +static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) +{ + for (; xfeature_uncompact_order[i] != -1; i++) { + if (mask & BIT_ULL(xfeature_uncompact_order[i])) + break; + } + + return i; +} + +/* Iterate xstate features in uncompacted order: */ +#define for_each_extended_xfeature_in_order(i, mask) \ + for (i = 0; \ + i = next_xfeature_order(i, mask), \ + xfeature_uncompact_order[i] != -1; \ + i++) + #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) @@ -199,7 +228,7 @@ void fpu__init_cpu_xstate(void) * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } @@ -209,16 +238,20 @@ static bool xfeature_enabled(enum xfeature xfeature) return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } +static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) +{ + return xstate_offsets[*(unsigned int *)xfeature1] - + xstate_offsets[*(unsigned int *)xfeature2]; +} + /* * Record the offsets and sizes of various xstates contained - * in the XSAVE state memory layout. + * in the XSAVE state memory layout. Also, create an ordered + * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { - u32 eax, ebx, ecx, edx, i; - /* start at the beginning of the "extended state" */ - unsigned int last_good_offset = offsetof(struct xregs_state, - extended_state_area); + u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form @@ -232,31 +265,30 @@ static void __init setup_xstate_cache(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); + for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { + cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); - xstate_sizes[i] = eax; - xstate_flags[i] = ecx; + xstate_sizes[xfeature] = eax; + xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ - if (xfeature_is_supervisor(i)) + if (xfeature_is_supervisor(xfeature)) continue; - xstate_offsets[i] = ebx; + xstate_offsets[xfeature] = ebx; - /* - * In our xstate size checks, we assume that the highest-numbered - * xstate feature has the highest offset in the buffer. Ensure - * it does. - */ - WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); - - last_good_offset = xstate_offsets[i]; + /* Populate the list of xfeatures before sorting */ + xfeature_uncompact_order[i++] = xfeature; } + + /* + * Sort xfeatures by their offsets to support out-of-order + * offsets in the uncompacted format. + */ + sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* @@ -340,7 +372,8 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ - XFEATURE_MASK_XTILE) + XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_APX) /* * setup the xstate image representing the init state @@ -540,6 +573,7 @@ static bool __init check_xstate_against_struct(int nr) case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); @@ -552,13 +586,20 @@ static bool __init check_xstate_against_struct(int nr) static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; - unsigned int offset = xstate_offsets[topmost]; + unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); - if (compacted) + if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); + } else { + /* Walk through the xfeature order to pick the last */ + for_each_extended_xfeature_in_order(i, xfeatures) + topmost = xfeature_uncompact_order[i]; + offset = xstate_offsets[topmost]; + } + return offset + xstate_sizes[topmost]; } @@ -639,7 +680,7 @@ static unsigned int __init get_xsave_compacted_size(void) return get_compacted_size(); /* Disable independent features. */ - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. @@ -648,7 +689,7 @@ static unsigned int __init get_xsave_compacted_size(void) size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } @@ -711,6 +752,8 @@ static int __init init_xstate_size(void) */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { + pr_info("x86/fpu: XSAVE disabled\n"); + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); @@ -727,7 +770,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = 0; - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); } /* @@ -775,6 +818,17 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && + fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { + /* + * This is a problematic CPU configuration where two + * conflicting state components are both enumerated. + */ + pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", + fpu_kernel_cfg.max_features); + goto out_disable; + } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; @@ -834,9 +888,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (err) goto out_disable; - /* Reset the state for the current task */ - fpstate_reset(¤t->thread.fpu); - /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: @@ -852,7 +903,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { - pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } @@ -864,7 +915,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { - pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } @@ -904,12 +955,12 @@ void fpu__resume_cpu(void) * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) - wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); + wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* @@ -1071,10 +1122,9 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; + unsigned int zerofrom, i, xfeature; struct xstate_header header; - unsigned int zerofrom; u64 mask; - int i; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; @@ -1143,15 +1193,16 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = header.xfeatures; - for_each_extended_xfeature(i, mask) { + for_each_extended_xfeature_in_order(i, mask) { + xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ - if (zerofrom < xstate_offsets[i]) - membuf_zero(&to, xstate_offsets[i] - zerofrom); + if (zerofrom < xstate_offsets[xfeature]) + membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); - if (i == XFEATURE_PKRU) { + if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the @@ -1161,14 +1212,14 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, - __raw_xsave_addr(xsave, i), - xstate_sizes[i]); + __raw_xsave_addr(xsave, xfeature), + xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ - zerofrom = xstate_offsets[i] + xstate_sizes[i]; + zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: @@ -1191,8 +1242,8 @@ out: void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, - tsk->thread.fpu.fpstate->user_xfeatures, + __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, + x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1332,7 +1383,7 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { - return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); + return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) @@ -1398,9 +1449,9 @@ void xrstors(struct xregs_state *xstate, u64 mask) } #if IS_ENABLED(CONFIG_KVM) -void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { - void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); @@ -1426,7 +1477,7 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ - if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* @@ -1503,7 +1554,7 @@ void fpstate_free(struct fpu *fpu) static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; @@ -1596,7 +1647,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; @@ -1606,16 +1657,20 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) if ((permitted & requested) == requested) return 0; - /* Calculate the resulting kernel state size */ + /* + * Calculate the resulting kernel state size. Note, @permitted also + * contains supervisor xfeatures even though supervisor are always + * permitted for kernel and guest FPUs, and never permitted for user + * FPUs. + */ mask = permitted | requested; - /* Take supervisor states into account on the host */ - if (!guest) - mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); - /* Calculate the resulting user state size */ - mask &= XFEATURE_MASK_USER_SUPPORTED; - usize = xstate_calculate_size(mask, false); + /* + * Calculate the resulting user state size. Take care not to clobber + * the supervisor xfeatures in the new mask! + */ + usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); @@ -1699,7 +1754,7 @@ int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) return -EPERM; } - fpu = ¤t->group_leader->thread.fpu; + fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; @@ -1804,7 +1859,7 @@ long fpu_xstate_prctl(int option, unsigned long arg2) */ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); + unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); long delta; if (!timestamp) { diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 0fd34f53f025..52ce19289989 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -5,6 +5,7 @@ #include <asm/cpufeature.h> #include <asm/fpu/xstate.h> #include <asm/fpu/xcr.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); @@ -22,7 +23,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) static inline u64 xstate_get_group_perm(bool guest) { - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; /* Pairs with WRITE_ONCE() in xstate_request_perm() */ @@ -69,21 +70,31 @@ static inline u64 xfeatures_mask_independent(void) return fpu_kernel_cfg.independent_features; } +static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask) +{ + u64 xfeatures; + int err; + + /* Read the xfeatures value already saved in the user buffer */ + err = __get_user(xfeatures, &xbuf->header.xfeatures); + xfeatures |= mask; + err |= __put_user(xfeatures, &xbuf->header.xfeatures); + + return err; +} + /* * Update the value of PKRU register that was already pushed onto the signal frame. */ -static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru) +static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) { - u64 xstate_bv; int err; if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) return 0; /* Mark PKRU as in-use so that it is restored correctly. */ - xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU; - - err = __put_user(xstate_bv, &buf->header.xfeatures); + err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU); if (err) return err; @@ -171,7 +182,7 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs #ifdef CONFIG_X86_64 static inline void xfd_set_state(u64 xfd) { - wrmsrl(MSR_IA32_XFD, xfd); + wrmsrq(MSR_IA32_XFD, xfd); __this_cpu_write(xfd_state, xfd); } @@ -288,7 +299,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ - struct fpstate *fpstate = current->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; @@ -307,7 +318,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr clac(); if (!err) - err = update_pkru_in_sigframe(buf, mask, pkru); + err = update_pkru_in_sigframe(buf, pkru); return err; } @@ -322,7 +333,7 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 u32 hmask = mask >> 32; int err; - xfd_validate_state(current->thread.fpu.fpstate, mask, true); + xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c index 5e2cd1004980..816187da3a47 100644 --- a/arch/x86/kernel/fred.c +++ b/arch/x86/kernel/fred.c @@ -3,6 +3,7 @@ #include <asm/desc.h> #include <asm/fred.h> +#include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/traps.h> @@ -43,23 +44,23 @@ void cpu_init_fred_exceptions(void) */ loadsegment(ss, __KERNEL_DS); - wrmsrl(MSR_IA32_FRED_CONFIG, + wrmsrq(MSR_IA32_FRED_CONFIG, /* Reserve for CALL emulation */ FRED_CONFIG_REDZONE | FRED_CONFIG_INT_STKLVL(0) | FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); - wrmsrl(MSR_IA32_FRED_STKLVLS, 0); + wrmsrq(MSR_IA32_FRED_STKLVLS, 0); /* * Ater a CPU offline/online cycle, the FRED RSP0 MSR should be * resynchronized with its per-CPU cache. */ - wrmsrl(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0)); + wrmsrq(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0)); - wrmsrl(MSR_IA32_FRED_RSP1, 0); - wrmsrl(MSR_IA32_FRED_RSP2, 0); - wrmsrl(MSR_IA32_FRED_RSP3, 0); + wrmsrq(MSR_IA32_FRED_RSP1, 0); + wrmsrq(MSR_IA32_FRED_RSP2, 0); + wrmsrq(MSR_IA32_FRED_RSP3, 0); /* Enable FRED */ cr4_set_bits(X86_CR4_FRED); @@ -79,14 +80,14 @@ void cpu_init_fred_rsps(void) * (remember that user space faults are always taken on stack level 0) * is to avoid overflowing the kernel stack. */ - wrmsrl(MSR_IA32_FRED_STKLVLS, + wrmsrq(MSR_IA32_FRED_STKLVLS, FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); /* The FRED equivalents to IST stacks... */ - wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); - wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); - wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); + wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cace6e8d7cc7..252e82bcfd2f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -55,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void) { /* * ftrace_make_{call,nop}() may be called during - * module load, and we need to finish the text_poke_queue() + * module load, and we need to finish the smp_text_poke_batch_add() * that they do, here. */ - text_poke_finish(); + smp_text_poke_batch_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); } @@ -119,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, /* replace the text with the new text */ if (ftrace_poke_late) - text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); else text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; @@ -186,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ip = (unsigned long)(&ftrace_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); ip = (unsigned long)(&ftrace_regs_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } @@ -247,10 +247,10 @@ void ftrace_replace_code(int enable) break; } - text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); ftrace_update_record(rec, enable); } - text_poke_finish(); + smp_text_poke_batch_finish(); } void arch_ftrace_update_code(int command) @@ -354,7 +354,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail; ip = trampoline + size; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(ip)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); @@ -492,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); mutex_unlock(&text_mutex); } @@ -586,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) const char *new; new = ftrace_jmp_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index de001b2146ab..375f2d7f1762 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void) *ptr = (unsigned long)ptep + PAGE_OFFSET; #ifdef CONFIG_MICROCODE_INITRD32 - /* Running on a hypervisor? */ - if (native_cpuid_ecx(1) & BIT(31)) - return; - params = (struct boot_params *)__pa_nodebug(&boot_params); if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image) return; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fa9b6339975f..533fcf5636fc 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -47,234 +47,22 @@ * Manage page tables very early on. */ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; -static unsigned int __initdata next_early_pgt; +unsigned int __initdata next_early_pgt; +SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); -#ifdef CONFIG_X86_5LEVEL unsigned int __pgtable_l5_enabled __ro_after_init; unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); -#endif -#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; EXPORT_SYMBOL(vmalloc_base); unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); -#endif - -static inline bool check_la57_support(void) -{ - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) - return false; - - /* - * 5-level paging is detected and enabled at kernel decompression - * stage. Only check if it has been enabled there. - */ - if (!(native_read_cr4() & X86_CR4_LA57)) - return false; - - RIP_REL_REF(__pgtable_l5_enabled) = 1; - RIP_REL_REF(pgdir_shift) = 48; - RIP_REL_REF(ptrs_per_p4d) = 512; - RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; - RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; - RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; - - return true; -} - -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, - pmdval_t *pmd, - unsigned long p2v_offset) -{ - unsigned long paddr, paddr_end; - int i; - - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - */ - if (sme_get_me_mask()) { - paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted); - paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted); - - for (; paddr < paddr_end; paddr += PMD_SIZE) { - /* - * On SNP, transition the page to shared in the RMP table so that - * it is consistent with the page table attribute change. - * - * __start_bss_decrypted has a virtual address in the high range - * mapping (kernel .text). PVALIDATE, by way of - * early_snp_set_memory_shared(), requires a valid virtual - * address but the kernel is currently running off of the identity - * mapping so use the PA to get a *currently* valid virtual address. - */ - early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); - - i = pmd_index(paddr - p2v_offset); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); -} - -/* Code in __startup_64() can be relocated during execution, but the compiler - * doesn't have to generate PC-relative relocations when accessing globals from - * that function. Clang actually does not generate them, which leads to - * boot-time crashes. To work around this problem, every global pointer must - * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined - * by subtracting p2v_offset from the RIP-relative address. - */ -unsigned long __head __startup_64(unsigned long p2v_offset, - struct boot_params *bp) -{ - pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); - unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text); - unsigned long va_text, va_end; - unsigned long pgtable_flags; - unsigned long load_delta; - pgdval_t *pgd; - p4dval_t *p4d; - pudval_t *pud; - pmdval_t *pmd, pmd_entry; - bool la57; - int i; - - la57 = check_la57_support(); - - /* Is the address too large? */ - if (physaddr >> MAX_PHYSMEM_BITS) - for (;;); - - /* - * Compute the delta between the address I am compiled to run at - * and the address I am actually running at. - */ - load_delta = __START_KERNEL_map + p2v_offset; - RIP_REL_REF(phys_base) = load_delta; - - /* Is the address not 2M aligned? */ - if (load_delta & ~PMD_MASK) - for (;;); - - va_text = physaddr - p2v_offset; - va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset; - - /* Include the SME encryption mask in the fixup value */ - load_delta += sme_get_me_mask(); - - /* Fixup the physical addresses in the page table */ - - pgd = &RIP_REL_REF(early_top_pgt)->pgd; - pgd[pgd_index(__START_KERNEL_map)] += load_delta; - - if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { - p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); - p4d[MAX_PTRS_PER_P4D - 1] += load_delta; - - pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; - } - - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta; - - for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) - RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta; - - /* - * Set up the identity mapping for the switchover. These - * entries should *NOT* have the global bit set! This also - * creates a bunch of nonsense entries but that is fine -- - * it avoids problems around wraparound. - */ - - pud = &early_pgts[0]->pmd; - pmd = &early_pgts[1]->pmd; - RIP_REL_REF(next_early_pgt) = 2; - - pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); - - if (la57) { - p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd; - - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; - pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; - - i = physaddr >> P4D_SHIFT; - p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - } else { - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + pgtable_flags; - pgd[i + 1] = (pgdval_t)pud + pgtable_flags; - } - - i = physaddr >> PUD_SHIFT; - pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - - pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; - /* Filter out unsupported __PAGE_KERNEL_* bits: */ - pmd_entry &= RIP_REL_REF(__supported_pte_mask); - pmd_entry += sme_get_me_mask(); - pmd_entry += physaddr; - - for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { - int idx = i + (physaddr >> PMD_SHIFT); - - pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; - } - - /* - * Fixup the kernel text+data virtual addresses. Note that - * we might write invalid pmds, when the kernel is relocated - * cleanup_highmap() fixes this up along with the mappings - * beyond _end. - * - * Only the region occupied by the kernel image has so far - * been checked against the table of usable memory regions - * provided by the firmware, so invalidate pages outside that - * region. A page table entry that maps to a reserved area of - * memory would allow processor speculation into that area, - * and on some hardware (particularly the UV platform) even - * speculative access to some reserved areas is caught as an - * error, causing the BIOS to halt the system. - */ - - pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; - - /* invalidate pages before the kernel image */ - for (i = 0; i < pmd_index(va_text); i++) - pmd[i] &= ~_PAGE_PRESENT; - - /* fixup pages that are part of the kernel image */ - for (; i <= pmd_index(va_end); i++) - if (pmd[i] & _PAGE_PRESENT) - pmd[i] += load_delta; - - /* invalidate pages after the kernel image */ - for (; i < PTRS_PER_PMD; i++) - pmd[i] &= ~_PAGE_PRESENT; - - return sme_postprocess_startup(bp, pmd, p2v_offset); -} /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) @@ -449,6 +237,12 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode /* Kill off the identity-map trampoline */ reset_early_page_tables(); + if (pgtable_l5_enabled()) { + page_offset_base = __PAGE_OFFSET_BASE_L5; + vmalloc_base = __VMALLOC_BASE_L5; + vmemmap_base = __VMEMMAP_BASE_L5; + } + clear_bss(); /* @@ -513,41 +307,6 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data) start_kernel(); } -/* - * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is - * used until the idt_table takes over. On the boot CPU this happens in - * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases - * this happens in the functions called from head_64.S. - * - * The idt_table can't be used that early because all the code modifying it is - * in idt.c and can be instrumented by tracing or KASAN, which both don't work - * during early CPU bringup. Also the idt_table has the runtime vectors - * configured which require certain CPU state to be setup already (like TSS), - * which also hasn't happened yet in early CPU bringup. - */ -static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; - -/* This may run while still in the direct mapping */ -static void __head startup_64_load_idt(void *vc_handler) -{ - struct desc_ptr desc = { - .address = (unsigned long)&RIP_REL_REF(bringup_idt_table), - .size = sizeof(bringup_idt_table) - 1, - }; - struct idt_data data; - gate_desc idt_desc; - - /* @vc_handler is set only for a VMM Communication Exception */ - if (vc_handler) { - init_idt_data(&data, X86_TRAP_VC, vc_handler); - idt_init_desc(&idt_desc, &data); - native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); - } - - native_load_idt(&desc); -} - -/* This is used when running on kernel addresses */ void early_setup_idt(void) { void *handler = NULL; @@ -559,30 +318,3 @@ void early_setup_idt(void) startup_64_load_idt(handler); } - -/* - * Setup boot CPU state needed before kernel switches to virtual addresses. - */ -void __head startup_64_setup_gdt_idt(void) -{ - struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt; - void *handler = NULL; - - struct desc_ptr startup_gdt_descr = { - .address = (unsigned long)&RIP_REL_REF(*gdt), - .size = GDT_SIZE - 1, - }; - - /* Load GDT */ - native_load_gdt(&startup_gdt_descr); - - /* New GDT is live - reload data segment registers */ - asm volatile("movl %%eax, %%ds\n" - "movl %%eax, %%ss\n" - "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) - handler = &RIP_REL_REF(vc_no_ghcb); - - startup_64_load_idt(handler); -} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 2e42056d2306..76743dfad6ab 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -86,7 +86,7 @@ SYM_CODE_START(startup_32) movl $pa(__bss_stop),%ecx subl %edi,%ecx shrl $2,%ecx - rep ; stosl + rep stosl /* * Copy bootup parameters out of the way. * Note: %esi still has the pointer to the real-mode data. @@ -98,15 +98,13 @@ SYM_CODE_START(startup_32) movl $pa(boot_params),%edi movl $(PARAM_SIZE/4),%ecx cld - rep - movsl + rep movsl movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi jz 1f # No command line movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx - rep - movsl + rep movsl 1: #ifdef CONFIG_OLPC diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fefe2a25cf02..3e9b3a3bd039 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -573,6 +573,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Pure iret required here - don't use INTERRUPT_RETURN */ iretq SYM_CODE_END(vc_no_ghcb) +SYM_PIC_ALIAS(vc_no_ghcb); #endif #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION @@ -604,10 +605,12 @@ SYM_DATA_START_PTI_ALIGNED(early_top_pgt) .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) +SYM_PIC_ALIAS(early_top_pgt) SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 SYM_DATA_END(early_dynamic_pgts) +SYM_PIC_ALIAS(early_dynamic_pgts); SYM_DATA(early_recursion_flag, .long 0) @@ -646,12 +649,11 @@ SYM_DATA_START_PTI_ALIGNED(init_top_pgt) SYM_DATA_END(init_top_pgt) #endif -#ifdef CONFIG_X86_5LEVEL SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level4_kernel_pgt) -#endif +SYM_PIC_ALIAS(level4_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -659,6 +661,7 @@ SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level3_kernel_pgt) +SYM_PIC_ALIAS(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* @@ -676,6 +679,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) +SYM_PIC_ALIAS(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 @@ -688,6 +692,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 SYM_DATA_END(level2_fixmap_pgt) +SYM_PIC_ALIAS(level2_fixmap_pgt) SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) @@ -703,6 +708,7 @@ SYM_DATA(smpboot_control, .long 0) .align 16 /* This must match the first entry in level2_kernel_pgt */ SYM_DATA(phys_base, .quad 0x0) +SYM_PIC_ALIAS(phys_base); EXPORT_SYMBOL(phys_base) #include "../xen/xen-head.S" diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7f4b2966e15c..d6387dde3ff9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,11 +7,12 @@ #include <linux/cpu.h> #include <linux/irq.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> #include <asm/mwait.h> +#include <asm/msr.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -970,7 +971,7 @@ static bool __init hpet_is_pc10_damaged(void) return false; /* Check whether PC10 is enabled in PKG C-state limit */ - rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, pcfg); if ((pcfg & 0xF) < 8) return false; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 80e262bb627f..cb9852ad6098 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -46,7 +46,8 @@ bool __init pit_timer_init(void) * VMMs otherwise steal CPU time just to pointlessly waggle * the (masked) IRQ. */ - clockevent_i8253_disable(); + scoped_guard(irq) + clockevent_i8253_disable(); return false; } clockevent_i8253_init(true); diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index cd8ed1edbf9e..9e9a591a5fec 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -49,7 +49,7 @@ static uint32_t jailhouse_cpuid_base(void) !boot_cpu_has(X86_FEATURE_HYPERVISOR)) return 0; - return hypervisor_cpuid_base("Jailhouse\0\0\0", 0); + return cpuid_base_hypervisor("Jailhouse\0\0\0", 0); } static uint32_t __init jailhouse_detect(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f5b8ef02d172..a7949a54a0ff 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); - text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } @@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); - text_poke_finish(); + smp_text_poke_batch_finish(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 09608fd93687..47cb8eb138ba 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -808,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p) u8 int3 = INT3_INSN_OPCODE; text_poke(p->addr, &int3, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } @@ -818,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p) perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } void arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 36d6809c6c9e..0aabd4c4e2c4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } @@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op) JMP32_INSN_SIZE - INT3_INSN_SIZE); text_poke(addr, new, INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); text_poke(addr + INT3_INSN_SIZE, new + INT3_INSN_SIZE, JMP32_INSN_SIZE - INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 3be9b3342c67..921c1c783bc1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -40,6 +40,7 @@ #include <asm/mtrr.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +#include <asm/msr.h> #include <asm/ptrace.h> #include <asm/reboot.h> #include <asm/svm.h> @@ -301,7 +302,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); - wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1); } set_irq_regs(old_regs); @@ -327,7 +328,7 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_debug("stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); } @@ -361,9 +362,9 @@ static void kvm_guest_cpu_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; - wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); + wrmsrq(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -376,7 +377,7 @@ static void kvm_guest_cpu_init(void) __this_cpu_write(kvm_apic_eoi, 0); pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) | KVM_MSR_ENABLED; - wrmsrl(MSR_KVM_PV_EOI_EN, pa); + wrmsrq(MSR_KVM_PV_EOI_EN, pa); } if (has_steal_clock) @@ -388,7 +389,7 @@ static void kvm_pv_disable_apf(void) if (!__this_cpu_read(async_pf_enabled)) return; - wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); + wrmsrq(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); @@ -399,7 +400,7 @@ static void kvm_disable_steal_time(void) if (!has_steal_clock) return; - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); + wrmsrq(MSR_KVM_STEAL_TIME, 0); } static u64 kvm_steal_clock(int cpu) @@ -451,9 +452,9 @@ static void kvm_guest_cpu_offline(bool shutdown) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); + wrmsrq(MSR_KVM_PV_EOI_EN, 0); if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) - wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); + wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0); kvm_pv_disable_apf(); if (!shutdown) apf_task_wake_all(); @@ -615,7 +616,7 @@ static int __init setup_efi_kvm_sev_migration(void) } pr_info("%s : live migration enabled in EFI\n", __func__); - wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); return 1; } @@ -728,7 +729,7 @@ static int kvm_suspend(void) #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) - rdmsrl(MSR_KVM_POLL_CONTROL, val); + rdmsrq(MSR_KVM_POLL_CONTROL, val); has_guest_poll = !(val & 1); #endif return 0; @@ -740,7 +741,7 @@ static void kvm_resume(void) #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) - wrmsrl(MSR_KVM_POLL_CONTROL, 0); + wrmsrq(MSR_KVM_POLL_CONTROL, 0); #endif } @@ -874,7 +875,7 @@ static noinline uint32_t __kvm_cpuid_base(void) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base(KVM_SIGNATURE, 0); + return cpuid_base_hypervisor(KVM_SIGNATURE, 0); return 0; } @@ -975,7 +976,7 @@ static void __init kvm_init_platform(void) * If not booted using EFI, enable Live migration support. */ if (!efi_enabled(EFI_BOOT)) - wrmsrl(MSR_KVM_MIGRATION_CONTROL, + wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); } kvmclock_init(); @@ -1124,12 +1125,12 @@ out: static void kvm_disable_host_haltpoll(void *i) { - wrmsrl(MSR_KVM_POLL_CONTROL, 0); + wrmsrq(MSR_KVM_POLL_CONTROL, 0); } static void kvm_enable_host_haltpoll(void *i) { - wrmsrl(MSR_KVM_POLL_CONTROL, 1); + wrmsrq(MSR_KVM_POLL_CONTROL, 1); } void arch_haltpoll_enable(unsigned int cpu) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5b2c15214a6b..ca0a49eeac4a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -60,7 +60,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); */ static void kvm_get_wallclock(struct timespec64 *now) { - wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); preempt_disable(); pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); preempt_enable(); @@ -173,7 +173,7 @@ static void kvm_register_clock(char *txt) return; pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; - wrmsrl(msr_kvm_system_time, pa); + wrmsrq(msr_kvm_system_time, pa); pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } @@ -196,7 +196,7 @@ static void kvm_setup_secondary_clock(void) void kvmclock_disable(void) { if (msr_kvm_system_time) - native_write_msr(msr_kvm_system_time, 0, 0); + native_write_msr(msr_kvm_system_time, 0); } static void __init kvmclock_init_mem(void) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 80265162aeff..1f325304c4a8 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -42,7 +42,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); + free_pages((unsigned long)image->arch.pgd, pgd_allocation_order()); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -59,7 +59,7 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PGD_ALLOCATION_ORDER); + pgd_allocation_order()); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a68f5a0a9f37..949c9e4bfad2 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -76,6 +76,19 @@ map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } #endif +static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p) +{ + unsigned long mstart, mend; + + if (!kexec_debug_8250_mmio32) + return 0; + + mstart = kexec_debug_8250_mmio32 & PAGE_MASK; + mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK; + pr_info("Map PCI serial at %lx - %lx\n", mstart, mend); + return kernel_ident_mapping_init(info, level4p, mstart, mend); +} + #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, @@ -285,6 +298,10 @@ static int init_pgtable(struct kimage *image, unsigned long control_page) if (result) return result; + result = map_mmio_serial(&info, image->arch.pgd); + if (result) + return result; + /* * This must be last because the intermediate page table pages it * allocates will not be control pages and may overlap the image. @@ -304,6 +321,24 @@ static void load_segments(void) ); } +static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs) +{ + gate_desc idtentry = { 0 }; + int i; + + idtentry.bits.p = 1; + idtentry.bits.type = GATE_TRAP; + idtentry.segment = __KERNEL_CS; + idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs; + idtentry.offset_middle = (control_page >> 16) & 0xFFFF; + idtentry.offset_high = control_page >> 32; + + for (i = 0; i < 16; i++) { + kexec_debug_idt[i] = idtentry; + idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE; + } +} + int machine_kexec_prepare(struct kimage *image) { void *control_page = page_address(image->control_code_page); @@ -321,6 +356,9 @@ int machine_kexec_prepare(struct kimage *image) if (image->type == KEXEC_TYPE_DEFAULT) kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT; + prepare_debug_idt((unsigned long)__pa(control_page), + (unsigned long)kexec_debug_exc_vectors - reloc_start); + __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); set_memory_rox((unsigned long)control_page, 1); @@ -396,16 +434,10 @@ void __nocfi machine_kexec(struct kimage *image) * with from a table in memory. At no other time is the * descriptor table in memory accessed. * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. + * Take advantage of this here by force loading the segments, + * before the GDT is zapped with an invalid value. */ load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - native_idt_invalidate(); - native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 1f54eedc3015..ef6104e7cc72 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -97,7 +97,7 @@ static void get_fam10h_pci_mmconf_base(void) /* SYS_CFG */ address = MSR_AMD64_SYSCFG; - rdmsrl(address, val); + rdmsrq(address, val); /* TOP_MEM2 is not enabled? */ if (!(val & (1<<21))) { @@ -105,7 +105,7 @@ static void get_fam10h_pci_mmconf_base(void) } else { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; - rdmsrl(address, val); + rdmsrq(address, val); tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); } @@ -177,7 +177,7 @@ void fam10h_check_enable_mmcfg(void) return; address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, val); + rdmsrq(address, val); /* try to make sure that AP's setting is identical to BSP setting */ if (val & FAM10H_MMIO_CONF_ENABLE) { @@ -212,7 +212,7 @@ void fam10h_check_enable_mmcfg(void) (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT)); val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | FAM10H_MMIO_CONF_ENABLE; - wrmsrl(address, val); + wrmsrq(address, val); } static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index a7998f351701..0ffbae902e2f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -206,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs, write, apply); if (!early) { - text_poke_sync(); + smp_text_poke_sync_each_cpu(); mutex_unlock(&text_mutex); } @@ -266,6 +266,8 @@ int module_finalize(const Elf_Ehdr *hdr, ibt_endbr = s; } + its_init_mod(me); + if (retpolines || cfi) { void *rseg = NULL, *cseg = NULL; unsigned int rsize = 0, csize = 0; @@ -286,6 +288,9 @@ int module_finalize(const Elf_Ehdr *hdr, void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); } + + its_fini_mod(me); + if (returns) { void *rseg = (void *)returns->sh_addr; apply_returns(rseg, rseg + returns->sh_size); @@ -326,4 +331,5 @@ int module_finalize(const Elf_Ehdr *hdr, void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); + its_free_mod(mod); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9a95d00f1423..be93ec7255bf 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -49,27 +49,20 @@ struct nmi_desc { struct list_head head; }; -static struct nmi_desc nmi_desc[NMI_MAX] = -{ - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), - .head = LIST_HEAD_INIT(nmi_desc[0].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), - .head = LIST_HEAD_INIT(nmi_desc[1].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), - .head = LIST_HEAD_INIT(nmi_desc[2].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), - .head = LIST_HEAD_INIT(nmi_desc[3].head), - }, +#define NMI_DESC_INIT(type) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[type].lock), \ + .head = LIST_HEAD_INIT(nmi_desc[type].head), \ +} +static struct nmi_desc nmi_desc[NMI_MAX] = { + NMI_DESC_INIT(NMI_LOCAL), + NMI_DESC_INIT(NMI_UNKNOWN), + NMI_DESC_INIT(NMI_SERR), + NMI_DESC_INIT(NMI_IO_CHECK), }; +#define nmi_to_desc(type) (&nmi_desc[type]) + struct nmi_stats { unsigned int normal; unsigned int unknown; @@ -91,6 +84,9 @@ static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); static int ignore_nmis __read_mostly; int unknown_nmi_panic; +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; + /* * Prevent NMI reason port (0x61) being accessed simultaneously, can * only be used in NMI handler. @@ -104,8 +100,6 @@ static int __init setup_unknown_nmi_panic(char *str) } __setup("unknown_nmi_panic", setup_unknown_nmi_panic); -#define nmi_to_desc(type) (&nmi_desc[type]) - static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; static int __init nmi_warning_debugfs(void) @@ -125,12 +119,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) action->max_duration = duration; - remainder_ns = do_div(duration, (1000 * 1000)); - decimal_msecs = remainder_ns / 1000; + /* Convert duration from nsec to msec */ + remainder_ns = do_div(duration, NSEC_PER_MSEC); + decimal_msecs = remainder_ns / NSEC_PER_USEC; - printk_ratelimited(KERN_INFO - "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", - action->handler, duration, decimal_msecs); + pr_info_ratelimited("INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + action->handler, duration, decimal_msecs); } static int nmi_handle(unsigned int type, struct pt_regs *regs) @@ -333,10 +327,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) int handled; /* - * Use 'false' as back-to-back NMIs are dealt with one level up. - * Of course this makes having multiple 'unknown' handlers useless - * as only the first one is ever run (unless it can actually determine - * if it caused the NMI) + * As a last resort, let the "unknown" handlers make a + * best-effort attempt to figure out if they can claim + * responsibility for this Unknown NMI. */ handled = nmi_handle(NMI_UNKNOWN, regs); if (handled) { @@ -366,17 +359,18 @@ static noinstr void default_do_nmi(struct pt_regs *regs) bool b2b = false; /* - * CPU-specific NMI must be processed before non-CPU-specific - * NMI, otherwise we may lose it, because the CPU-specific - * NMI can not be detected/processed on other CPUs. - */ - - /* - * Back-to-back NMIs are interesting because they can either - * be two NMI or more than two NMIs (any thing over two is dropped - * due to NMI being edge-triggered). If this is the second half - * of the back-to-back NMI, assume we dropped things and process - * more handlers. Otherwise reset the 'swallow' NMI behaviour + * Back-to-back NMIs are detected by comparing the RIP of the + * current NMI with that of the previous NMI. If it is the same, + * it is assumed that the CPU did not have a chance to jump back + * into a non-NMI context and execute code in between the two + * NMIs. + * + * They are interesting because even if there are more than two, + * only a maximum of two can be detected (anything over two is + * dropped due to NMI being edge-triggered). If this is the + * second half of the back-to-back NMI, assume we dropped things + * and process more handlers. Otherwise, reset the 'swallow' NMI + * behavior. */ if (regs->ip == __this_cpu_read(last_nmi_rip)) b2b = true; @@ -390,6 +384,11 @@ static noinstr void default_do_nmi(struct pt_regs *regs) if (microcode_nmi_handler_enabled() && microcode_nmi_handler()) goto out; + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. + */ handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { @@ -426,13 +425,14 @@ static noinstr void default_do_nmi(struct pt_regs *regs) pci_serr_error(reason, regs); else if (reason & NMI_REASON_IOCHK) io_check_error(reason, regs); -#ifdef CONFIG_X86_32 + /* * Reassert NMI in case it became active * meanwhile as it's edge-triggered: */ - reassert_nmi(); -#endif + if (IS_ENABLED(CONFIG_X86_32)) + reassert_nmi(); + __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); goto out; @@ -751,4 +751,3 @@ void local_touch_nmi(void) { __this_cpu_write(last_nmi_rip, 0); } -EXPORT_SYMBOL_GPL(local_touch_nmi); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e93a8545c74d..a010e9d062bf 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * arch/x86/kernel/nmi-selftest.c - * * Testsuite for NMI: IPIs * * Started by Don Zickus: @@ -30,7 +28,6 @@ static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; static int __initdata testcase_total; static int __initdata testcase_successes; -static int __initdata expected_testcase_failures; static int __initdata unexpected_testcase_failures; static int __initdata unexpected_testcase_unknowns; @@ -120,26 +117,22 @@ static void __init dotest(void (*testcase_fn)(void), int expected) unexpected_testcase_failures++; if (nmi_fail == FAILURE) - printk(KERN_CONT "FAILED |"); + pr_cont("FAILED |"); else if (nmi_fail == TIMEOUT) - printk(KERN_CONT "TIMEOUT|"); + pr_cont("TIMEOUT|"); else - printk(KERN_CONT "ERROR |"); + pr_cont("ERROR |"); dump_stack(); } else { testcase_successes++; - printk(KERN_CONT " ok |"); + pr_cont(" ok |"); } - testcase_total++; + pr_cont("\n"); + testcase_total++; reset_nmi(); } -static inline void __init print_testname(const char *testname) -{ - printk("%12s:", testname); -} - void __init nmi_selftest(void) { init_nmi_testsuite(); @@ -147,38 +140,25 @@ void __init nmi_selftest(void) /* * Run the testsuite: */ - printk("----------------\n"); - printk("| NMI testsuite:\n"); - printk("--------------------\n"); + pr_info("----------------\n"); + pr_info("| NMI testsuite:\n"); + pr_info("--------------------\n"); - print_testname("remote IPI"); + pr_info("%12s:", "remote IPI"); dotest(remote_ipi, SUCCESS); - printk(KERN_CONT "\n"); - print_testname("local IPI"); + + pr_info("%12s:", "local IPI"); dotest(local_ipi, SUCCESS); - printk(KERN_CONT "\n"); cleanup_nmi_testsuite(); + pr_info("--------------------\n"); if (unexpected_testcase_failures) { - printk("--------------------\n"); - printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + pr_info("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", unexpected_testcase_failures, testcase_total); - printk("-----------------------------------------------------------------\n"); - } else if (expected_testcase_failures && testcase_successes) { - printk("--------------------\n"); - printk("%3d out of %3d testcases failed, as expected. |\n", - expected_testcase_failures, testcase_total); - printk("----------------------------------------------------\n"); - } else if (expected_testcase_failures && !testcase_successes) { - printk("--------------------\n"); - printk("All %3d testcases failed, as expected. |\n", - expected_testcase_failures); - printk("----------------------------------------\n"); } else { - printk("--------------------\n"); - printk("Good, all %3d testcases passed! |\n", + pr_info("Good, all %3d testcases passed! |\n", testcase_successes); - printk("---------------------------------\n"); } + pr_info("-----------------------------------------------------------------\n"); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ccd05d8999f..ab3e172dcc69 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -33,6 +33,7 @@ #include <asm/tlb.h> #include <asm/io_bitmap.h> #include <asm/gsseg.h> +#include <asm/msr.h> /* stub always returning 0. */ DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); @@ -210,12 +211,10 @@ struct paravirt_patch_template pv_ops = { .mmu.set_p4d = native_set_p4d, -#if CONFIG_PGTABLE_LEVELS >= 5 .mmu.p4d_val = PTE_IDENT, .mmu.make_p4d = PTE_IDENT, .mmu.set_pgd = native_set_pgd, -#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 962c3ce39323..c1d2dac72b9c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,7 +30,7 @@ #include <linux/hw_breakpoint.h> #include <linux/entry-common.h> #include <asm/cpu.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> @@ -52,6 +52,7 @@ #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> +#include <asm/msr.h> #include <asm/shstk.h> #include "process.h" @@ -93,17 +94,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - /* init_task is not dynamically sized (incomplete FPU state) */ - if (unlikely(src == &init_task)) - memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(init_task), 0); - else - memcpy(dst, src, arch_task_struct_size); + /* fpu_clone() will initialize the "dst_fpu" memory */ + memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(*dst), 0); #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - /* Drop the copied pointer to current's fpstate */ - dst->thread.fpu.fpstate = NULL; return 0; } @@ -111,8 +107,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_X86_64 void arch_release_task_struct(struct task_struct *tsk) { - if (fpu_state_size_dynamic()) - fpstate_free(&tsk->thread.fpu); + if (fpu_state_size_dynamic() && !(tsk->flags & (PF_KTHREAD | PF_USER_WORKER))) + fpstate_free(x86_task_fpu(tsk)); } #endif @@ -122,7 +118,6 @@ void arch_release_task_struct(struct task_struct *tsk) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); @@ -130,7 +125,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); shstk_free(tsk); - fpu__drop(fpu); + fpu__drop(tsk); } static int set_new_tls(struct task_struct *p, unsigned long tls) @@ -344,7 +339,7 @@ static void set_cpuid_faulting(bool on) msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); this_cpu_write(msr_misc_features_shadow, msrval); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval); } static void disable_cpuid(void) @@ -561,7 +556,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) if (!static_cpu_has(X86_FEATURE_ZEN)) { msr |= ssbd_tif_to_amd_ls_cfg(tifn); - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); return; } @@ -578,7 +573,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) raw_spin_lock(&st->shared_state->lock); /* First sibling enables SSBD: */ if (!st->shared_state->disable_state) - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); st->shared_state->disable_state++; raw_spin_unlock(&st->shared_state->lock); } else { @@ -588,7 +583,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) raw_spin_lock(&st->shared_state->lock); st->shared_state->disable_state--; if (!st->shared_state->disable_state) - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); raw_spin_unlock(&st->shared_state->lock); } } @@ -597,7 +592,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); } #endif @@ -607,7 +602,7 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, * so ssbd_tif_to_spec_ctrl() just works. */ - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } /* @@ -710,11 +705,11 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) arch_has_block_step()) { unsigned long debugctl, msk; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~DEBUGCTLMSR_BTF; msk = tifn & _TIF_BLOCKSTEP; debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } if ((tifp ^ tifn) & _TIF_NOTSC) @@ -907,13 +902,10 @@ static __init bool prefer_mwait_c1_over_halt(void) static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - mb(); /* quirk */ - clflush((void *)¤t_thread_info()->flags); - mb(); /* quirk */ - } + const void *addr = ¤t_thread_info()->flags; - __monitor((void *)¤t_thread_info()->flags, 0, 0); + alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); + __monitor(addr, 0, 0); if (!need_resched()) { __sti_mwait(0, 0); raw_local_irq_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4636ef359973..9bd4fa694da5 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -160,8 +160,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -208,8 +207,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); - switch_fpu_finish(next_p); - /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7196ca7048be..f39ff02e498d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -57,6 +57,7 @@ #include <asm/unistd.h> #include <asm/fsgsbase.h> #include <asm/fred.h> +#include <asm/msr.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -95,8 +96,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, return; if (mode == SHOW_REGS_USER) { - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrq(MSR_FS_BASE, fs); + rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); printk("%sFS: %016lx GS: %016lx\n", log_lvl, fs, shadowgs); return; @@ -107,9 +108,9 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrq(MSR_FS_BASE, fs); + rdmsrq(MSR_GS_BASE, gs); + rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -195,7 +196,7 @@ static noinstr unsigned long __rdgsbase_inactive(void) native_swapgs(); } else { instrumentation_begin(); - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + rdmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } @@ -221,7 +222,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) native_swapgs(); } else { instrumentation_begin(); - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + wrmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } } @@ -353,7 +354,7 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } else { if (prev_index != next_index) loadseg(which, next_index); - wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); } } else { @@ -463,7 +464,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void) gsbase = __rdgsbase_inactive(); local_irq_restore(flags); } else { - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + rdmsrq(MSR_KERNEL_GS_BASE, gsbase); } return gsbase; @@ -478,7 +479,7 @@ void x86_gsbase_write_cpu_inactive(unsigned long gsbase) __wrgsbase_inactive(gsbase); local_irq_restore(flags); } else { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + wrmsrq(MSR_KERNEL_GS_BASE, gsbase); } } @@ -616,8 +617,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(hardirq_stack_inuse)); - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -671,8 +671,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_p); - /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index b7c0f142d026..4679ac0a03eb 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -27,7 +27,7 @@ static void cs5530a_warm_reset(struct pci_dev *dev) static void cs5536_warm_reset(struct pci_dev *dev) { /* writing 1 to the LSB of this MSR causes a hard reset */ - wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL); + wrmsrq(MSR_DIVIL_SOFT_RESET, 1ULL); udelay(50); /* shouldn't get here but be safe and spin a while */ } diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c7c4b1917336..57276f134d12 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -263,17 +263,17 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movl %edx, %edi movl $1024, %ecx - rep ; movsl + rep movsl movl %ebp, %edi movl %eax, %esi movl $1024, %ecx - rep ; movsl + rep movsl movl %eax, %edi movl %edx, %esi movl $1024, %ecx - rep ; movsl + rep movsl lea PAGE_SIZE(%ebp), %esi jmp 0b diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ac058971a382..ea604f4d0b52 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -39,6 +39,8 @@ SYM_DATA(kexec_va_control_page, .quad 0) SYM_DATA(kexec_pa_table_page, .quad 0) SYM_DATA(kexec_pa_swap_page, .quad 0) SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) +SYM_DATA(kexec_debug_8250_mmio32, .quad 0) +SYM_DATA(kexec_debug_8250_port, .word 0) .balign 16 SYM_DATA_START_LOCAL(kexec_debug_gdt) @@ -50,6 +52,11 @@ SYM_DATA_START_LOCAL(kexec_debug_gdt) .quad 0x00cf92000000ffff /* __KERNEL_DS */ SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end) + .balign 8 +SYM_DATA_START(kexec_debug_idt) + .skip 0x100, 0x00 +SYM_DATA_END(kexec_debug_idt) + .section .text..relocate_kernel,"ax"; .code64 SYM_CODE_START_NOALIGN(relocate_kernel) @@ -72,8 +79,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - /* zero out flags, and disable interrupts */ - pushq $0 + /* Invalidate GDT/IDT, zero out flags */ + pushq $0 + pushq $0 + + lidt (%rsp) + lgdt (%rsp) + addq $8, %rsp popfq /* Switch to the identity mapped page tables */ @@ -139,6 +151,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %ds, %rax movq %rax, %ds + /* Now an IDTR on the stack to load the IDT the kernel created */ + leaq kexec_debug_idt(%rip), %rsi + pushq %rsi + pushw $0xff + lidt (%rsp) + addq $10, %rsp + + //int3 + /* * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP * below. @@ -342,20 +363,20 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) /* copy source page to swap page */ movq kexec_pa_swap_page(%rip), %rdi movl $512, %ecx - rep ; movsq + rep movsq /* copy destination page to source page */ movq %rax, %rdi movq %rdx, %rsi movl $512, %ecx - rep ; movsq + rep movsq /* copy swap page to destination page */ movq %rdx, %rdi movq kexec_pa_swap_page(%rip), %rsi .Lnoswap: movl $512, %ecx - rep ; movsq + rep movsq lea PAGE_SIZE(%rax), %rsi jmp .Lloop @@ -364,3 +385,222 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) ret int3 SYM_CODE_END(swap_pages) + +/* + * Generic 'print character' routine + * - %al: Character to be printed (may clobber %rax) + * - %rdx: MMIO address or port. + */ +#define XMTRDY 0x20 + +#define TXR 0 /* Transmit register (WRITE) */ +#define LSR 5 /* Line Status */ + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + addw $LSR, %dx + xchg %al, %ah +.Lxmtrdy_loop: + inb %dx, %al + testb $XMTRDY, %al + jnz .Lready + pause + jmp .Lxmtrdy_loop + +.Lready: + subw $LSR, %dx + xchg %al, %ah + outb %al, %dx +pr_char_null: + ANNOTATE_NOENDBR + + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250) + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR +.Lxmtrdy_loop_mmio: + movb (LSR*4)(%rdx), %ah + testb $XMTRDY, %ah + jnz .Lready_mmio + pause + jmp .Lxmtrdy_loop_mmio + +.Lready_mmio: + movb %al, (%rdx) + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250_mmio32) + +/* + * Load pr_char function pointer into %rsi and load %rdx with whatever + * that function wants to see there (typically port/MMIO address). + */ +.macro pr_setup + leaq pr_char_8250(%rip), %rsi + movw kexec_debug_8250_port(%rip), %dx + testw %dx, %dx + jnz 1f + + leaq pr_char_8250_mmio32(%rip), %rsi + movq kexec_debug_8250_mmio32(%rip), %rdx + testq %rdx, %rdx + jnz 1f + + leaq pr_char_null(%rip), %rsi +1: +.endm + +/* Print the nybble in %bl, clobber %rax */ +SYM_CODE_START_LOCAL_NOALIGN(pr_nybble) + UNWIND_HINT_FUNC + movb %bl, %al + nop + andb $0x0f, %al + addb $0x30, %al + cmpb $0x3a, %al + jb 1f + addb $('a' - '0' - 10), %al + ANNOTATE_RETPOLINE_SAFE +1: jmp *%rsi +SYM_CODE_END(pr_nybble) + +SYM_CODE_START_LOCAL_NOALIGN(pr_qword) + UNWIND_HINT_FUNC + movq $16, %rcx +1: rolq $4, %rbx + call pr_nybble + loop 1b + movb $'\n', %al + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi +SYM_CODE_END(pr_qword) + +.macro print_reg a, b, c, d, r + movb $\a, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\b, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\c, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\d, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movq \r, %rbx + call pr_qword +.endm + +SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors) + /* Each of these is 6 bytes. */ +.macro vec_err exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + nop + nop + pushq $\exc + jmp exc_handler +.endm + +.macro vec_noerr exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + pushq $0 + pushq $\exc + jmp exc_handler +.endm + + ANNOTATE_NOENDBR + vec_noerr 0 // #DE + vec_noerr 1 // #DB + vec_noerr 2 // #NMI + vec_noerr 3 // #BP + vec_noerr 4 // #OF + vec_noerr 5 // #BR + vec_noerr 6 // #UD + vec_noerr 7 // #NM + vec_err 8 // #DF + vec_noerr 9 + vec_err 10 // #TS + vec_err 11 // #NP + vec_err 12 // #SS + vec_err 13 // #GP + vec_err 14 // #PF + vec_noerr 15 +SYM_CODE_END(kexec_debug_exc_vectors) + +SYM_CODE_START_LOCAL_NOALIGN(exc_handler) + /* No need for RET mitigations during kexec */ + VALIDATE_UNRET_END + + pushq %rax + pushq %rbx + pushq %rcx + pushq %rdx + pushq %rsi + + /* Stack frame */ +#define EXC_SS 0x58 /* Architectural... */ +#define EXC_RSP 0x50 +#define EXC_EFLAGS 0x48 +#define EXC_CS 0x40 +#define EXC_RIP 0x38 +#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */ +#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */ +#define EXC_RAX 0x20 /* Pushed just above in exc_handler */ +#define EXC_RBX 0x18 +#define EXC_RCX 0x10 +#define EXC_RDX 0x08 +#define EXC_RSI 0x00 + + /* Set up %rdx/%rsi for debug output */ + pr_setup + + /* rip and exception info */ + print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp) + print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp) + print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp) + print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp) + + /* We spilled these to the stack */ + print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp) + print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp) + print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp) + print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp) + print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp) + + /* Other registers untouched */ + print_reg 'r', 'd', 'i', ':', %rdi + print_reg 'r', '8', ' ', ':', %r8 + print_reg 'r', '9', ' ', ':', %r9 + print_reg 'r', '1', '0', ':', %r10 + print_reg 'r', '1', '1', ':', %r11 + print_reg 'r', '1', '2', ':', %r12 + print_reg 'r', '1', '3', ':', %r13 + print_reg 'r', '1', '4', ':', %r14 + print_reg 'r', '1', '5', ':', %r15 + print_reg 'c', 'r', '2', ':', %cr2 + + /* Only return from INT3 */ + cmpq $3, EXC_EXCEPTION(%rsp) + jne .Ldie + + popq %rsi + popq %rdx + popq %rcx + popq %rbx + popq %rax + + addq $16, %rsp + iretq + +.Ldie: + hlt + jmp .Ldie + +SYM_CODE_END(exc_handler) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d2a13b37833..7d9ed79a93c0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -11,6 +11,7 @@ #include <linux/crash_dump.h> #include <linux/dma-map-ops.h> #include <linux/efi.h> +#include <linux/hugetlb.h> #include <linux/ima.h> #include <linux/init_ohci1394_dma.h> #include <linux/initrd.h> @@ -18,21 +19,19 @@ #include <linux/memblock.h> #include <linux/panic_notifier.h> #include <linux/pci.h> +#include <linux/random.h> #include <linux/root_dev.h> -#include <linux/hugetlb.h> -#include <linux/tboot.h> -#include <linux/usb/xhci-dbgp.h> #include <linux/static_call.h> #include <linux/swiotlb.h> -#include <linux/random.h> +#include <linux/tboot.h> +#include <linux/usb/xhci-dbgp.h> +#include <linux/vmalloc.h> #include <uapi/linux/mount.h> #include <xen/xen.h> #include <asm/apic.h> -#include <asm/efi.h> -#include <asm/numa.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> #include <asm/cacheinfo.h> @@ -47,18 +46,16 @@ #include <asm/mce.h> #include <asm/memtype.h> #include <asm/mtrr.h> -#include <asm/realmode.h> +#include <asm/nmi.h> +#include <asm/numa.h> #include <asm/olpc_ofw.h> #include <asm/pci-direct.h> #include <asm/prom.h> #include <asm/proto.h> +#include <asm/realmode.h> #include <asm/thermal.h> #include <asm/unwind.h> #include <asm/vsyscall.h> -#include <linux/vmalloc.h> -#if defined(CONFIG_X86_LOCAL_APIC) -#include <asm/nmi.h> -#endif /* * max_low_pfn_mapped: highest directly mapped pfn < 4 GB @@ -134,6 +131,7 @@ struct ist_info ist_info; struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +SYM_PIC_ALIAS(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) __visible unsigned long mmu_cr4_features __ro_after_init; @@ -151,6 +149,13 @@ int bootloader_type, bootloader_version; static const struct ctl_table x86_sysctl_table[] = { { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "panic_on_unrecovered_nmi", .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), @@ -185,15 +190,6 @@ static const struct ctl_table x86_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #if defined(CONFIG_ACPI_SLEEP) { .procname = "acpi_video_flags", diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 059685612362..2ddf23387c7e 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -173,8 +173,8 @@ static int shstk_setup(void) return PTR_ERR((void *)addr); fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, addr + size); - wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + wrmsrq(MSR_IA32_PL3_SSP, addr + size); + wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN); fpregs_unlock(); shstk->base = addr; @@ -239,7 +239,7 @@ static unsigned long get_user_shstk_addr(void) fpregs_lock_and_load(); - rdmsrl(MSR_IA32_PL3_SSP, ssp); + rdmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); @@ -372,7 +372,7 @@ int setup_signal_shadow_stack(struct ksignal *ksig) return -EFAULT; fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, ssp); + wrmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; @@ -396,7 +396,7 @@ int restore_signal_shadow_stack(void) return err; fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, ssp); + wrmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; @@ -460,7 +460,7 @@ static int wrss_control(bool enable) return 0; fpregs_lock_and_load(); - rdmsrl(MSR_IA32_U_CET, msrval); + rdmsrq(MSR_IA32_U_CET, msrval); if (enable) { features_set(ARCH_SHSTK_WRSS); @@ -473,7 +473,7 @@ static int wrss_control(bool enable) msrval &= ~CET_WRSS_EN; } - wrmsrl(MSR_IA32_U_CET, msrval); + wrmsrq(MSR_IA32_U_CET, msrval); unlock: fpregs_unlock(); @@ -492,8 +492,8 @@ static int shstk_disable(void) fpregs_lock_and_load(); /* Disable WRSS too when disabling shadow stack */ - wrmsrl(MSR_IA32_U_CET, 0); - wrmsrl(MSR_IA32_PL3_SSP, 0); + wrmsrq(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_PL3_SSP, 0); fpregs_unlock(); shstk_free(current); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 5f441039b572..2404233336ab 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -255,7 +255,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (v8086_mode(regs)) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); @@ -423,14 +423,14 @@ bool sigaltstack_size_valid(size_t ss_size) if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) return true; - fsize += current->group_leader->thread.fpu.perm.__user_state_size; + fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size; if (likely(ss_size > fsize)) return true; if (strict_sigaltstack_size) return ss_size > fsize; - mask = current->group_leader->thread.fpu.perm.__state_perm; + mask = x86_task_fpu(current->group_leader)->perm.__state_perm; if (mask & XFEATURE_MASK_USER_DYNAMIC) return ss_size > fsize; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d6cf1e23c2a3..b90d872aa0c8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,7 +64,7 @@ #include <asm/acpi.h> #include <asm/cacheinfo.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> @@ -1188,6 +1188,12 @@ void cpu_disable_common(void) remove_siblinginfo(cpu); + /* + * Stop allowing kernel-mode FPU. This is needed so that if the CPU is + * brought online again, the initial state is not allowed: + */ + this_cpu_write(kernel_fpu_allowed, false); + /* It's now safe to remove this processor from the online map */ lock_vector_lock(); remove_cpu_from_maps(cpu); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index a59c72e77645..378c388d1b31 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -81,7 +81,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, break; case RET: - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(insn)) code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; @@ -90,7 +90,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, case JCC: if (!func) { func = __static_call_return; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) func = x86_return_thunk; } @@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, emulate); + smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index b8e7abe00b06..708d61743d15 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -4,7 +4,7 @@ */ #include <asm/trace_clock.h> #include <asm/barrier.h> -#include <asm/msr.h> +#include <asm/tsc.h> /* * trace_clock_x86_tsc(): A clock that is just the cycle counter. diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c deleted file mode 100644 index 03ae1caaa878..000000000000 --- a/arch/x86/kernel/tracepoint.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> - */ -#include <linux/jump_label.h> -#include <linux/atomic.h> - -#include <asm/trace/exceptions.h> - -DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); - -int trace_pagefault_reg(void) -{ - static_branch_inc(&trace_pagefault_key); - return 0; -} - -void trace_pagefault_unreg(void) -{ - static_branch_dec(&trace_pagefault_key); -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f88b8a78e50..94c0236963c6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -68,6 +68,7 @@ #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -749,7 +750,7 @@ static bool try_fixup_enqcmd_gp(void) if (current->pasid_activated) return false; - wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; @@ -882,16 +883,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. @@ -1120,9 +1121,9 @@ static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) */ unsigned long debugctl; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* @@ -1295,7 +1296,7 @@ DEFINE_IDTENTRY_RAW(exc_debug) static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; - struct fpu *fpu = &task->thread.fpu; + struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -1386,11 +1387,11 @@ static bool handle_xfd_event(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; - rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 88e5a4ed9db3..87e749106dda 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -16,7 +16,7 @@ #include <linux/static_key.h> #include <linux/static_call.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> @@ -29,6 +29,7 @@ #include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/i8259.h> +#include <asm/msr.h> #include <asm/topology.h> #include <asm/uv/uv.h> #include <asm/sev.h> @@ -1098,7 +1099,7 @@ static void __init detect_art(void) if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; - rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset); + rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 4334033658ed..ec3aa340d351 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <asm/msr.h> #include <asm/tsc.h> struct tsc_adjust { @@ -65,12 +66,12 @@ void tsc_verify_tsc_adjust(bool resume) adj->nextcheck = jiffies + HZ; - rdmsrl(MSR_IA32_TSC_ADJUST, curval); + rdmsrq(MSR_IA32_TSC_ADJUST, curval); if (adj->adjusted == curval) return; /* Restore the original value */ - wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, adj->adjusted); if (!adj->warned || resume) { pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", @@ -142,7 +143,7 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, if (likely(!tsc_async_resets)) { pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); - wrmsrl(MSR_IA32_TSC_ADJUST, 0); + wrmsrq(MSR_IA32_TSC_ADJUST, 0); bootval = 0; } else { pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n", @@ -165,7 +166,7 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) if (check_tsc_unstable()) return false; - rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + rdmsrq(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); @@ -187,7 +188,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return false; - rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + rdmsrq(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; cur->warned = false; @@ -229,7 +230,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) */ if (bootval != ref->adjusted) { cur->adjusted = ref->adjusted; - wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, ref->adjusted); } /* * We have the TSCs forced to be in sync on this package. Skip sync @@ -518,7 +519,7 @@ retry: pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted); - wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, cur->adjusted); goto retry; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 9194695662b2..6d383839e839 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -840,6 +840,11 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) insn_byte_t p; int i; + /* x86_nops[insn->length]; same as jmp with .offs = 0 */ + if (insn->length <= ASM_NOP_MAX && + !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) + goto setup; + switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ccdc45e5b759..4fa0be732af1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -79,11 +79,13 @@ const_cpu_current_top_of_stack = cpu_current_top_of_stack; #define BSS_DECRYPTED \ . = ALIGN(PMD_SIZE); \ __start_bss_decrypted = .; \ + __pi___start_bss_decrypted = .; \ *(.bss..decrypted); \ . = ALIGN(PAGE_SIZE); \ __start_bss_decrypted_unused = .; \ . = ALIGN(PMD_SIZE); \ __end_bss_decrypted = .; \ + __pi___end_bss_decrypted = .; \ #else @@ -128,6 +130,7 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + __pi__text = .; _stext = .; ALIGN_ENTRY_TEXT_BEGIN *(.text..__x86.rethunk_untrain) @@ -391,6 +394,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; + __pi__end = .; #ifdef CONFIG_AMD_MEM_ENCRYPT /* @@ -466,10 +470,18 @@ SECTIONS } /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause + * this. Let's assume that nobody will be running a COMPILE_TEST kernel and + * let's assert that fuller build coverage is more valuable than being able to + * run a COMPILE_TEST kernel. + */ +#ifndef CONFIG_COMPILE_TEST +/* + * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +#endif /* needed for Clang - see arch/x86/entry/entry.S */ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); @@ -497,6 +509,16 @@ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); "SRSO function pair won't alias"); #endif +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline"); +. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart"); +. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array"); +#endif + +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline"); +#endif + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5e4d4934c0d3..ecd85f4801cc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -21,7 +21,7 @@ #include <asm/user.h> #include <asm/fpu/xstate.h> #include <asm/sgx.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -236,7 +236,7 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp struct kvm_cpuid_entry2 *entry; u32 base; - for_each_possible_hypervisor_cpuid_base(base) { + for_each_possible_cpuid_base_hypervisor(base) { entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { @@ -1427,8 +1427,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1444,8 +1444,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1763,7 +1761,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 050a0e229a4d..f2b36d32ef40 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); + /* * Checking root.hpa is sufficient even when KVM has mirror root. * We can have either: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 63bb77ee1bb1..8d1b632e33d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; + if (WARN_ON_ONCE(range->end <= range->start)) + return false; + + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); + + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + /* Unmap the old attribute page. */ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) range->attr_filter = KVM_FILTER_SHARED; @@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, return kvm_unmap_gfn_range(kvm, range); } -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} - -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; -} static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7cc0564f5f97..21a3b8166242 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,9 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -325,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 699e551ec93b..9864c057187d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } +EXPORT_SYMBOL_GPL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 65fd245a9953..067f8e3f5a0d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -330,7 +331,7 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); @@ -796,12 +797,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); @@ -820,7 +824,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -896,10 +900,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", @@ -933,6 +937,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -951,33 +957,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -993,6 +972,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0bc708ee2788..1aa0f07d3a63 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -26,6 +26,7 @@ #include <asm/fpu/xcr.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/msr.h> #include <asm/sev.h> #include "mmu.h" @@ -2933,6 +2934,7 @@ void __init sev_set_cpu_caps(void) void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -3059,6 +3061,15 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; + + if (!sev_enabled) + return; + + /* + * Do both SNP and SEV initialization at KVM module load. + */ + init_args.probe = true; + sev_platform_init(&init_args); } void sev_hardware_unsetup(void) @@ -3074,6 +3085,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -3119,7 +3132,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) * back to WBINVD if this faults so as not to make any problems worse * by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_wbinvd; return; @@ -3173,9 +3186,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3184,18 +3202,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3266,11 +3290,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5d0c5c3300b..67fee545d42a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -31,6 +31,7 @@ #include <linux/string_choices.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -475,24 +476,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -566,7 +561,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } @@ -579,15 +574,15 @@ static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. */ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } @@ -607,9 +602,6 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); - - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -619,7 +611,7 @@ static int svm_enable_virtualization_cpu(void) uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -629,9 +621,9 @@ static int svm_enable_virtualization_cpu(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -652,13 +644,12 @@ static int svm_enable_virtualization_cpu(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -687,9 +678,6 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); - return 0; } @@ -1518,6 +1506,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1550,6 +1595,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -2149,14 +2199,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2167,17 +2216,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2231,6 +2274,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -5036,6 +5083,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -5061,6 +5110,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } @@ -5232,7 +5282,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..f16b068c4228 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -335,6 +335,8 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ccda95e53f62..ba736cbb0587 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -11,6 +11,13 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#ifdef CREATE_TRACE_POINTS +#define tracing_kvm_rip_read(vcpu) ({ \ + typeof(vcpu) __vcpu = vcpu; \ + __vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \ + }) +#endif + /* * Tracepoint for guest mode entry. */ @@ -28,7 +35,7 @@ TRACE_EVENT(kvm_entry, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->rip = kvm_rip_read(vcpu); + __entry->rip = tracing_kvm_rip_read(vcpu); __entry->immediate_exit = force_immediate_exit; kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info, @@ -319,7 +326,7 @@ TRACE_EVENT(name, \ ), \ \ TP_fast_assign( \ - __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->guest_rip = tracing_kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ __entry->requests = READ_ONCE(vcpu->requests); \ @@ -423,7 +430,7 @@ TRACE_EVENT(kvm_page_fault, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->guest_rip = kvm_rip_read(vcpu); + __entry->guest_rip = tracing_kvm_rip_read(vcpu); __entry->fault_address = fault_address; __entry->error_code = error_code; ), diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5504d9e9fd32..d268224227f0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6,6 +6,7 @@ #include <asm/debugreg.h> #include <asm/mmu_context.h> +#include <asm/msr.h> #include "x86.h" #include "cpuid.h" @@ -7202,8 +7203,8 @@ static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 77012b2eca0e..231a9633359c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -13,6 +13,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include "x86.h" #include "cpuid.h" @@ -279,9 +280,9 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, local_irq_disable(); if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { if (read) - rdmsrl(index, msr_info->data); + rdmsrq(index, msr_info->data); else - wrmsrl(index, msr_info->data); + wrmsrq(index, msr_info->data); __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable(); return true; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index ec08fa3caf43..d70e5b90087d 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING + static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { return &(to_vmx(vcpu)->pi_desc); @@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); + + /* + * In addition to taking the wakeup lock for the regular/IRQ + * context, tell lockdep it is being taken for the "sched out" + * context as well. vCPU loads happens in task context, and + * this is taking the lock of the *previous* CPU, i.e. can race + * with both the scheduler and the wakeup handler. + */ + raw_spin_lock(spinlock); + spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); list_del(&vmx->pi_wakeup_list); - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + spin_release(&spinlock->dep_map, _RET_IP_); + raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); @@ -148,11 +161,23 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; - unsigned long flags; - local_irq_save(flags); + lockdep_assert_irqs_disabled(); - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + /* + * Acquire the wakeup lock using the "sched out" context to workaround + * a lockdep false positive. When this is called, schedule() holds + * various per-CPU scheduler locks. When the wakeup handler runs, it + * holds this CPU's wakeup lock while calling try_to_wake_up(), which + * can eventually take the aforementioned scheduler locks, which causes + * lockdep to assume there is deadlock. + * + * Deadlock can't actually occur because IRQs are disabled for the + * entirety of the sched_out critical section, i.e. the wakeup handler + * can't run while the scheduler locks are held. + */ + raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), + PI_LOCK_SCHED_OUT); list_add_tail(&vmx->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); @@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - - local_irq_restore(flags); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) @@ -274,6 +297,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; @@ -312,21 +336,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - + !kvm_irq_is_postable(&irq)) continue; - } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; @@ -334,11 +345,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); + if (!set) + continue; + + enable_remapped_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", __func__); @@ -346,6 +358,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 9961e07cf071..df1d0cf76947 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -2,6 +2,7 @@ /* Copyright(c) 2021 Intel Corporation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/msr.h> #include <asm/sgx.h> #include "x86.h" @@ -411,16 +412,16 @@ void setup_default_sgx_lepubkeyhash(void) * MSRs exist but are read-only (locked and not writable). */ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || - rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + rdmsrq_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; } else { /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c5766467a61..157c23db22be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -46,6 +46,7 @@ #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> @@ -273,6 +274,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: @@ -380,9 +382,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -393,7 +395,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -1063,7 +1065,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, * provide that period, so a CPU could write host's record into * guest's memory. */ - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } i = vmx_find_loadstore_msr_slot(&m->guest, msr); @@ -1192,13 +1194,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1206,13 +1208,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1225,9 +1227,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - wrmsrl(MSR_IA32_RTIT_CTL, 0); + wrmsrq(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } @@ -1248,7 +1250,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. */ if (vmx->pt_desc.host.ctl) - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, @@ -1338,7 +1340,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -1362,7 +1364,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; #ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -1382,7 +1384,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->guest_state_loaded = false; @@ -1394,7 +1396,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); if (vmx->guest_state_loaded) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; } @@ -1403,7 +1405,7 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); if (vmx->guest_state_loaded) - wrmsrl(MSR_KERNEL_GS_BASE, data); + wrmsrq(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } @@ -2574,7 +2576,7 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; - rdmsrl(msr, allowed); + rdmsrq(msr, allowed); return ctl_opt & allowed; } @@ -2746,7 +2748,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, break; } - rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) @@ -2766,7 +2768,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; - rdmsrl(MSR_IA32_VMX_MISC, misc_msr); + rdmsrq(MSR_IA32_VMX_MISC, misc_msr); vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; @@ -2850,7 +2852,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; @@ -4391,7 +4393,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { @@ -6745,7 +6747,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -7052,7 +7054,7 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * the #NM exception. */ if (is_xfd_nm_fault(vcpu)) - rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) @@ -7307,7 +7309,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, return; if (flags & VMX_RUN_SAVE_SPEC_CTRL) - vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); /* * If the guest/host SPEC_CTRL values differ, restore the host value. @@ -7318,7 +7320,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || vmx->spec_ctrl != hostval) - native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); } @@ -7358,10 +7360,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. + * + * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, + * and is affected by MMIO Stale Data. In such cases mitigation in only + * needed against an MMIO capable guest. */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mmio_stale_data_clear) && + else if (static_branch_unlikely(&cpu_buf_vm_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); @@ -7700,6 +7706,7 @@ int vmx_vm_init(struct kvm *kvm) case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -7959,7 +7966,7 @@ static __init u64 vmx_get_perf_capabilities(void) return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { x86_perf_get_lbr(&vmx_lbr_caps); @@ -8508,7 +8515,7 @@ __init int vmx_hardware_setup(void) kvm_enable_efer_bits(EFER_NX); if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c841817a914a..5bdb5b854924 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -578,7 +578,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(kvm_uret_msrs_list[slot], values->host); + wrmsrq(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -590,10 +590,10 @@ static int kvm_probe_user_return_msr(u32 msr) int ret; preempt_disable(); - ret = rdmsrl_safe(msr, &val); + ret = rdmsrq_safe(msr, &val); if (ret) goto out; - ret = wrmsrl_safe(msr, val); + ret = wrmsrq_safe(msr, val); out: preempt_enable(); return ret; @@ -630,7 +630,7 @@ static void kvm_user_return_msr_cpu_online(void) int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { - rdmsrl_safe(kvm_uret_msrs_list[i], &value); + rdmsrq_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } @@ -644,7 +644,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); + err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; @@ -1174,7 +1174,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); + wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); } if (cpu_feature_enabled(X86_FEATURE_PKU) && @@ -1205,7 +1205,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, kvm_host.xss); + wrmsrq(MSR_IA32_XSS, kvm_host.xss); } } @@ -1584,7 +1584,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1618,6 +1618,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -1660,7 +1662,7 @@ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, *data = MSR_PLATFORM_INFO_CPUID_FAULT; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(index, data); + rdmsrq_safe(index, data); break; default: return kvm_x86_call(get_feature_msr)(index, data); @@ -3827,7 +3829,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_PRED_CMD, data); + wrmsrq(MSR_IA32_PRED_CMD, data); break; } case MSR_IA32_FLUSH_CMD: @@ -3840,7 +3842,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); break; case MSR_EFER: return set_efer(vcpu, msr_info); @@ -4597,7 +4599,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } -static inline u32 kvm_sync_valid_fields(struct kvm *kvm) +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) { return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; } @@ -9736,7 +9738,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) * with an exception. PAT[0] is set to WB on RESET and also by the * kernel, i.e. failure indicates a kernel bug or broken firmware. */ - if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { pr_err("host PAT[0] is not WB\n"); return -EIO; @@ -9770,15 +9772,15 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } - rdmsrl_safe(MSR_EFER, &kvm_host.efer); + rdmsrq_safe(MSR_EFER, &kvm_host.efer); if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, kvm_host.xss); + rdmsrq(MSR_IA32_XSS, kvm_host.xss); kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); r = ops->hardware_setup(); if (r != 0) @@ -10974,7 +10976,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) switch_fpu_return(); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -11060,7 +11062,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_call(handle_exit_irqoff)(vcpu); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* * Consume any pending interrupts, including the possible source of @@ -11098,7 +11100,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } @@ -11492,7 +11495,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; - u32 sync_valid_fields; + u64 sync_valid_fields; int r; r = kvm_mmu_post_init_vm(vcpu->kvm); @@ -11786,6 +11789,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11799,6 +11804,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); @@ -13552,25 +13559,27 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_arch_has_irq_bypass(void) -{ - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; } @@ -13580,9 +13589,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; /* * When producer of consumer is unregistered, we change back to @@ -13590,12 +13599,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + spin_unlock_irq(&kvm->irqfds.lock); + + kvm_arch_end_assignment(irqfd->kvm); } @@ -13608,7 +13623,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); @@ -13652,12 +13668,12 @@ int kvm_spec_ctrl_test_value(u64 value) local_irq_save(flags); - if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; - else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) ret = 1; else - wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); local_irq_restore(flags); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 1c50352eb49f..4fa5c4e1ba8a 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -3,6 +3,8 @@ # Makefile for x86 specific library files. # +obj-y += crypto/ + # Produces uninteresting flaky coverage. KCOV_INSTRUMENT_delay.o := n @@ -39,14 +41,14 @@ lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o -crc32-x86-y := crc32-glue.o crc32-pclmul.o +crc32-x86-y := crc32.o crc32-pclmul.o crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o -crc64-x86-y := crc64-glue.o crc64-pclmul.o +crc64-x86-y := crc64.o crc64-pclmul.o obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o -crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o +crc-t10dif-x86-y := crc-t10dif.o crc16-msb-pclmul.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o obj-y += iomem.o diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif.c index f89c335cde3c..db7ce59c31ac 100644 --- a/arch/x86/lib/crc-t10dif-glue.c +++ b/arch/x86/lib/crc-t10dif.c @@ -9,7 +9,7 @@ #include <linux/module.h> #include "crc-pclmul-template.h" -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); @@ -29,7 +29,7 @@ static int __init crc_t10dif_x86_init(void) } return 0; } -arch_initcall(crc_t10dif_x86_init); +subsys_initcall(crc_t10dif_x86_init); static void __exit crc_t10dif_x86_exit(void) { diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32.c index e3f93b17ac3f..d09343e2cea9 100644 --- a/arch/x86/lib/crc32-glue.c +++ b/arch/x86/lib/crc32.c @@ -11,8 +11,8 @@ #include <linux/module.h> #include "crc-pclmul-template.h" -static DEFINE_STATIC_KEY_FALSE(have_crc32); -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); @@ -88,7 +88,7 @@ static int __init crc32_x86_init(void) } return 0; } -arch_initcall(crc32_x86_init); +subsys_initcall(crc32_x86_init); static void __exit crc32_x86_exit(void) { diff --git a/arch/x86/lib/crc64-glue.c b/arch/x86/lib/crc64.c index b0e1b719ecbf..351a09f5813e 100644 --- a/arch/x86/lib/crc64-glue.c +++ b/arch/x86/lib/crc64.c @@ -9,7 +9,7 @@ #include <linux/module.h> #include "crc-pclmul-template.h" -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64); DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64); @@ -39,7 +39,7 @@ static int __init crc64_x86_init(void) } return 0; } -arch_initcall(crc64_x86_init); +subsys_initcall(crc64_x86_init); static void __exit crc64_x86_exit(void) { diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore new file mode 100644 index 000000000000..580c839bb177 --- /dev/null +++ b/arch/x86/lib/crypto/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-x86_64-cryptogams.S diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig new file mode 100644 index 000000000000..5e94cdee492c --- /dev/null +++ b/arch/x86/lib/crypto/Kconfig @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_X86 + bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" + depends on 64BIT + select CRYPTO_LIB_BLAKE2S_GENERIC + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX-512 (Advanced Vector Extensions-512) + +config CRYPTO_CHACHA20_X86_64 + tristate + depends on 64BIT + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_X86_64 + tristate + depends on 64BIT + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + +config CRYPTO_SHA256_X86_64 + tristate + depends on 64BIT + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD + select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile new file mode 100644 index 000000000000..abceca3d31c0 --- /dev/null +++ b/arch/x86/lib/crypto/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o +libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o + +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o +chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o +targets += poly1305-x86_64-cryptogams.S + +obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o +sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $< > $@ + +$(obj)/%.S: $(src)/%.pl FORCE + $(call if_changed,perlasm) diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S index b50b35ff1fdb..ac1c845445a4 100644 --- a/arch/x86/crypto/blake2s-core.S +++ b/arch/x86/lib/crypto/blake2s-core.S @@ -29,7 +29,6 @@ SIGMA: .byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 .byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 .byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -#ifdef CONFIG_AS_AVX512 .section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 .align 64 SIGMA2: @@ -43,7 +42,6 @@ SIGMA2: .long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 .long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 .long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 -#endif /* CONFIG_AS_AVX512 */ .text SYM_FUNC_START(blake2s_compress_ssse3) @@ -174,7 +172,6 @@ SYM_FUNC_START(blake2s_compress_ssse3) RET SYM_FUNC_END(blake2s_compress_ssse3) -#ifdef CONFIG_AS_AVX512 SYM_FUNC_START(blake2s_compress_avx512) vmovdqu (%rdi),%xmm0 vmovdqu 0x10(%rdi),%xmm1 @@ -253,4 +250,3 @@ SYM_FUNC_START(blake2s_compress_avx512) vzeroupper RET SYM_FUNC_END(blake2s_compress_avx512) -#endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c index 0313f9673f56..adc296cd17c9 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/lib/crypto/blake2s-glue.c @@ -3,17 +3,15 @@ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ -#include <crypto/internal/blake2s.h> - -#include <linux/types.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/sizes.h> - #include <asm/cpufeature.h> #include <asm/fpu/api.h> #include <asm/processor.h> #include <asm/simd.h> +#include <crypto/internal/blake2s.h> +#include <linux/init.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/sizes.h> asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *block, const size_t nblocks, @@ -41,8 +39,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block, SZ_4K / BLAKE2S_BLOCK_SIZE); kernel_fpu_begin(); - if (IS_ENABLED(CONFIG_AS_AVX512) && - static_branch_likely(&blake2s_use_avx512)) + if (static_branch_likely(&blake2s_use_avx512)) blake2s_compress_avx512(state, block, blocks, inc); else blake2s_compress_ssse3(state, block, blocks, inc); @@ -59,8 +56,7 @@ static int __init blake2s_mod_init(void) if (boot_cpu_has(X86_FEATURE_SSSE3)) static_branch_enable(&blake2s_use_ssse3); - if (IS_ENABLED(CONFIG_AS_AVX512) && - boot_cpu_has(X86_FEATURE_AVX) && + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) && diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/lib/crypto/chacha-avx2-x86_64.S index f3d8fc018249..f3d8fc018249 100644 --- a/arch/x86/crypto/chacha-avx2-x86_64.S +++ b/arch/x86/lib/crypto/chacha-avx2-x86_64.S diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S index 259383e1ad44..259383e1ad44 100644 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S index 7111949cd5b9..7111949cd5b9 100644 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c new file mode 100644 index 000000000000..10b2c945f541 --- /dev/null +++ b/arch/x86/lib/crypto/chacha_glue.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha and HChaCha functions (x86_64 optimized) + * + * Copyright (C) 2015 Martin Willi + */ + +#include <asm/simd.h> +#include <crypto/chacha.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> + +asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (static_branch_likely(&chacha_use_avx512vl)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + if (static_branch_likely(&chacha_use_avx2)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state->x[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd)) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, out, nrounds); + kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd) || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, todo, nrounds); + kernel_fpu_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&chacha_use_simd); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&chacha_use_simd); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } + return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl index b9abcd79c1f4..501827254fed 100644 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl @@ -118,6 +118,19 @@ sub declare_function() { } } +sub declare_typed_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= "SYM_TYPED_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + sub end_function() { my ($name) = @_; if($kernel) { @@ -128,7 +141,7 @@ sub end_function() { } $code.=<<___ if $kernel; -#include <linux/linkage.h> +#include <linux/cfi_types.h> ___ if ($avx) { @@ -236,14 +249,14 @@ ___ $code.=<<___ if (!$kernel); .extern OPENSSL_ia32cap_P -.globl poly1305_init_x86_64 -.hidden poly1305_init_x86_64 +.globl poly1305_block_init_arch +.hidden poly1305_block_init_arch .globl poly1305_blocks_x86_64 .hidden poly1305_blocks_x86_64 .globl poly1305_emit_x86_64 .hidden poly1305_emit_x86_64 ___ -&declare_function("poly1305_init_x86_64", 32, 3); +&declare_typed_function("poly1305_block_init_arch", 32, 3); $code.=<<___; xor %eax,%eax mov %rax,0($ctx) # initialize hash value @@ -298,7 +311,7 @@ $code.=<<___; .Lno_key: RET ___ -&end_function("poly1305_init_x86_64"); +&end_function("poly1305_block_init_arch"); &declare_function("poly1305_blocks_x86_64", 32, 4); $code.=<<___; @@ -2811,18 +2824,10 @@ if ($avx>2) { # reason stack layout is kept identical to poly1305_blocks_avx2. If not # for this tail, we wouldn't have to even allocate stack frame... -if($kernel) { - $code .= "#ifdef CONFIG_AS_AVX512\n"; -} - &declare_function("poly1305_blocks_avx512", 32, 4); poly1305_blocks_avxN(1); &end_function("poly1305_blocks_avx512"); -if ($kernel) { - $code .= "#endif\n"; -} - if (!$kernel && $avx>3) { ######################################################################## # VPMADD52 version using 2^44 radix. @@ -4113,9 +4118,9 @@ avx_handler: .section .pdata .align 4 - .rva .LSEH_begin_poly1305_init_x86_64 - .rva .LSEH_end_poly1305_init_x86_64 - .rva .LSEH_info_poly1305_init_x86_64 + .rva .LSEH_begin_poly1305_block_init_arch + .rva .LSEH_end_poly1305_block_init_arch + .rva .LSEH_info_poly1305_block_init_arch .rva .LSEH_begin_poly1305_blocks_x86_64 .rva .LSEH_end_poly1305_blocks_x86_64 @@ -4163,10 +4168,10 @@ ___ $code.=<<___; .section .xdata .align 8 -.LSEH_info_poly1305_init_x86_64: +.LSEH_info_poly1305_block_init_arch: .byte 9,0,0,0 .rva se_handler - .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 + .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch .LSEH_info_poly1305_blocks_x86_64: .byte 9,0,0,0 diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c new file mode 100644 index 000000000000..b7e78a583e07 --- /dev/null +++ b/arch/x86/lib/crypto/poly1305_glue.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <asm/cpu_device_id.h> +#include <asm/fpu/api.h> +#include <crypto/internal/poly1305.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/unaligned.h> + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, + unsigned int len, u32 padbit) +{ + struct poly1305_arch_internal *ctx = + container_of(&state->h.h, struct poly1305_arch_internal, h); + + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || + SZ_4K % POLY1305_BLOCK_SIZE); + + if (!static_branch_likely(&poly1305_use_avx)) { + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return; + } + + do { + const unsigned int bytes = min(len, SZ_4K); + + kernel_fpu_begin(); + if (static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + + len -= bytes; + inp += bytes; + } while (len); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +{ + if (!static_branch_likely(&poly1305_use_avx)) + poly1305_emit_x86_64(ctx, mac, nonce); + else + poly1305_emit_avx(ctx, mac, nonce); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&poly1305_use_avx); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_simd_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); + return 0; +} +subsys_initcall(poly1305_simd_mod_init); + +static void __exit poly1305_simd_mod_exit(void) +{ +} +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); +MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S index 53de72bdd851..0d7b2c3e45d9 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/lib/crypto/sha256-avx-asm.S @@ -48,7 +48,7 @@ ######################################################################## #include <linux/linkage.h> -#include <linux/cfi_types.h> +#include <linux/objtool.h> ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -341,13 +341,13 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks) -## arg 1 : pointer to state -## arg 2 : pointer to input data -## arg 3 : Num blocks +## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], +## const u8 *data, size_t nblocks); ######################################################################## .text -SYM_TYPED_FUNC_START(sha256_transform_avx) +SYM_FUNC_START(sha256_transform_avx) + ANNOTATE_NOENDBR # since this is called only via static_call + pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S index 0bbec1c75cd0..25d3380321ec 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/lib/crypto/sha256-avx2-asm.S @@ -49,7 +49,7 @@ ######################################################################## #include <linux/linkage.h> -#include <linux/cfi_types.h> +#include <linux/objtool.h> ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -518,13 +518,13 @@ STACK_SIZE = _CTX + _CTX_SIZE .endm ######################################################################## -## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks) -## arg 1 : pointer to state -## arg 2 : pointer to input data -## arg 3 : Num blocks +## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], +## const u8 *data, size_t nblocks); ######################################################################## .text -SYM_TYPED_FUNC_START(sha256_transform_rorx) +SYM_FUNC_START(sha256_transform_rorx) + ANNOTATE_NOENDBR # since this is called only via static_call + pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S index d515a55a3bc1..d3548206cf3d 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/lib/crypto/sha256-ni-asm.S @@ -54,9 +54,9 @@ */ #include <linux/linkage.h> -#include <linux/cfi_types.h> +#include <linux/objtool.h> -#define DIGEST_PTR %rdi /* 1st arg */ +#define STATE_PTR %rdi /* 1st arg */ #define DATA_PTR %rsi /* 2nd arg */ #define NUM_BLKS %rdx /* 3rd arg */ @@ -98,24 +98,20 @@ .endm /* - * Intel SHA Extensions optimized implementation of a SHA-256 update function + * Intel SHA Extensions optimized implementation of a SHA-256 block function * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. + * This function takes a pointer to the current SHA-256 state, a pointer to the + * input data, and the number of 64-byte blocks to process. Once all blocks + * have been processed, the state is updated with the new state. This function + * only processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization is expected to be handled elsewhere. * - * void sha256_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks); - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process + * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], + * const u8 *data, size_t nblocks); */ - .text -SYM_TYPED_FUNC_START(sha256_ni_transform) +SYM_FUNC_START(sha256_ni_transform) + ANNOTATE_NOENDBR # since this is called only via static_call shl $6, NUM_BLKS /* convert to bytes */ jz .Ldone_hash @@ -126,8 +122,8 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) * Need to reorder these appropriately * DCBA, HGFE -> ABEF, CDGH */ - movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */ - movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */ + movdqu 0*16(STATE_PTR), STATE0 /* DCBA */ + movdqu 1*16(STATE_PTR), STATE1 /* HGFE */ movdqa STATE0, TMP punpcklqdq STATE1, STATE0 /* FEBA */ @@ -166,8 +162,8 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) pshufd $0xB1, STATE0, STATE0 /* HGFE */ pshufd $0x1B, STATE1, STATE1 /* DCBA */ - movdqu STATE1, 0*16(DIGEST_PTR) - movdqu STATE0, 1*16(DIGEST_PTR) + movdqu STATE1, 0*16(STATE_PTR) + movdqu STATE0, 1*16(STATE_PTR) .Ldone_hash: diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S index 93264ee44543..7f24a4cdcb25 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/lib/crypto/sha256-ssse3-asm.S @@ -47,7 +47,7 @@ ######################################################################## #include <linux/linkage.h> -#include <linux/cfi_types.h> +#include <linux/objtool.h> ## assume buffers not aligned #define MOVDQ movdqu @@ -348,15 +348,13 @@ a = TMP_ .endm ######################################################################## -## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data, -## int blocks); -## arg 1 : pointer to state -## (struct sha256_state is assumed to begin with u32 state[8]) -## arg 2 : pointer to input data -## arg 3 : Num blocks +## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], +## const u8 *data, size_t nblocks); ######################################################################## .text -SYM_TYPED_FUNC_START(sha256_transform_ssse3) +SYM_FUNC_START(sha256_transform_ssse3) + ANNOTATE_NOENDBR # since this is called only via static_call + pushq %rbx pushq %r12 pushq %r13 diff --git a/arch/x86/lib/crypto/sha256.c b/arch/x86/lib/crypto/sha256.c new file mode 100644 index 000000000000..80380f8fdcee --- /dev/null +++ b/arch/x86/lib/crypto/sha256.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include <asm/fpu/api.h> +#include <crypto/internal/sha2.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/static_call.h> + +asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); +asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); + +DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); + +void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha256_x86)) { + kernel_fpu_begin(); + static_call(sha256_blocks_x86)(state, data, nblocks); + kernel_fpu_end(); + } else { + sha256_blocks_generic(state, data, nblocks); + } +} +EXPORT_SYMBOL_GPL(sha256_blocks_simd); + +void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], + const u8 *data, size_t nblocks) +{ + sha256_blocks_generic(state, data, nblocks); +} +EXPORT_SYMBOL_GPL(sha256_blocks_arch); + +bool sha256_is_arch_optimized(void) +{ + return static_key_enabled(&have_sha256_x86); +} +EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); + +static int __init sha256_x86_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha256_blocks_x86, sha256_ni_transform); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | + XFEATURE_MASK_YMM, NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha256_blocks_x86, + sha256_transform_rorx); + else + static_call_update(sha256_blocks_x86, + sha256_transform_avx); + } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { + return 0; + } + static_branch_enable(&have_sha256_x86); + return 0; +} +subsys_initcall(sha256_x86_mod_init); + +static void __exit sha256_x86_mod_exit(void) +{ +} +module_exit(sha256_x86_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-256 optimized for x86_64"); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e86eda2c0b04..eb2d2e1cbddd 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -75,7 +75,7 @@ static void delay_tsc(u64 cycles) /* Allow RT tasks to run */ preempt_enable(); - rep_nop(); + native_pause(); preempt_disable(); /* diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 98631c0e7a11..4e385cbfd444 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -13,6 +13,7 @@ #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/ldt.h> +#include <asm/msr.h> #include <asm/vm86.h> #undef pr_fmt @@ -631,14 +632,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel) /* Bits [15:3] contain the index of the desired entry. */ sel >>= 3; - mutex_lock(¤t->active_mm->context.lock); - ldt = current->active_mm->context.ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return false; + + mutex_lock(¤t->mm->context.lock); + ldt = current->mm->context.ldt; if (ldt && sel < ldt->nr_entries) { *out = ldt->entries[sel]; success = true; } - mutex_unlock(¤t->active_mm->context.lock); + mutex_unlock(¤t->mm->context.lock); return success; } @@ -702,16 +710,16 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) unsigned long base; if (seg_reg_idx == INAT_SEG_REG_FS) { - rdmsrl(MSR_FS_BASE, base); + rdmsrq(MSR_FS_BASE, base); } else if (seg_reg_idx == INAT_SEG_REG_GS) { /* * swapgs was called at the kernel entry point. Thus, * MSR_KERNEL_GS_BASE will have the user-space GS base. */ if (user_mode(regs)) - rdmsrl(MSR_KERNEL_GS_BASE, base); + rdmsrq(MSR_KERNEL_GS_BASE, base); else - rdmsrl(MSR_GS_BASE, base); + rdmsrq(MSR_GS_BASE, base); } else { base = 0; } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 6ffb931b9fb1..149a57e334ab 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -324,6 +324,11 @@ int insn_get_opcode(struct insn *insn) } insn->attr = inat_get_opcode_attribute(op); + if (insn->x86_64 && inat_is_invalid64(insn->attr)) { + /* This instruction is invalid, like UD2. Stop decoding. */ + insn->attr &= INAT_INV64; + } + while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); @@ -337,6 +342,7 @@ int insn_get_opcode(struct insn *insn) insn->attr = 0; return -EINVAL; } + end: opcode->got = 1; return 0; @@ -658,7 +664,6 @@ int insn_get_immediate(struct insn *insn) } if (!inat_has_immediate(insn->attr)) - /* no immediates */ goto done; switch (inat_immediate_size(insn->attr)) { diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c index 5eecb45d05d5..c20e04764edc 100644 --- a/arch/x86/lib/iomem.c +++ b/arch/x86/lib/iomem.c @@ -10,7 +10,7 @@ static __always_inline void rep_movs(void *to, const void *from, size_t n) { unsigned long d0, d1, d2; - asm volatile("rep ; movsl\n\t" + asm volatile("rep movsl\n\t" "testb $2,%b4\n\t" "je 1f\n\t" "movsw\n" diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index a58f451a7dd3..b5893928d55c 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -8,7 +8,7 @@ */ #include <asm/asm.h> #include <asm/kaslr.h> -#include <asm/msr.h> +#include <asm/tsc.h> #include <asm/archrandom.h> #include <asm/e820/api.h> #include <asm/shared/io.h> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 0ae2e1712e2e..12a23fa7c44c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -41,6 +41,7 @@ SYM_FUNC_END(__memcpy) EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy) +SYM_PIC_ALIAS(memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_START_LOCAL(memcpy_orig) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index d66b710d628f..fb5a03cf5ab7 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -42,6 +42,7 @@ SYM_FUNC_END(__memset) EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS_MEMFUNC(memset, __memset) +SYM_PIC_ALIAS(memset) EXPORT_SYMBOL(memset) SYM_FUNC_START_LOCAL(memset_orig) diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index acd463d887e1..b8f63419e6ae 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -47,7 +47,7 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) } EXPORT_SYMBOL(rdmsr_on_cpu); -int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { int err; struct msr_info rv; @@ -60,7 +60,7 @@ int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) return err; } -EXPORT_SYMBOL(rdmsrl_on_cpu); +EXPORT_SYMBOL(rdmsrq_on_cpu); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { @@ -78,7 +78,7 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) } EXPORT_SYMBOL(wrmsr_on_cpu); -int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; @@ -92,7 +92,7 @@ int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) return err; } -EXPORT_SYMBOL(wrmsrl_on_cpu); +EXPORT_SYMBOL(wrmsrq_on_cpu); static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs, @@ -204,7 +204,7 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) } EXPORT_SYMBOL(wrmsr_safe_on_cpu); -int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; @@ -218,9 +218,9 @@ int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) return err ? err : rv.err; } -EXPORT_SYMBOL(wrmsrl_safe_on_cpu); +EXPORT_SYMBOL(wrmsrq_safe_on_cpu); -int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { u32 low, high; int err; @@ -230,7 +230,7 @@ int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) return err; } -EXPORT_SYMBOL(rdmsrl_safe_on_cpu); +EXPORT_SYMBOL(rdmsrq_safe_on_cpu); /* * These variants are significantly slower, but allows control over diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 5a18ecc04a6c..4ef7c6dcbea6 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -41,7 +41,7 @@ static int msr_read(u32 msr, struct msr *m) int err; u64 val; - err = rdmsrl_safe(msr, &val); + err = rdmsrq_safe(msr, &val); if (!err) m->q = val; @@ -58,7 +58,7 @@ static int msr_read(u32 msr, struct msr *m) */ static int msr_write(u32 msr, struct msr *m) { - return wrmsrl_safe(msr, m->q); + return wrmsrq_safe(msr, m->q); } static inline int __flip_bit(u32 msr, u8 bit, bool set) @@ -122,23 +122,23 @@ int msr_clear_bit(u32 msr, u8 bit) EXPORT_SYMBOL_GPL(msr_clear_bit); #ifdef CONFIG_TRACEPOINTS -void do_trace_write_msr(unsigned int msr, u64 val, int failed) +void do_trace_write_msr(u32 msr, u64 val, int failed) { trace_write_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_write_msr); EXPORT_TRACEPOINT_SYMBOL(write_msr); -void do_trace_read_msr(unsigned int msr, u64 val, int failed) +void do_trace_read_msr(u32 msr, u64 val, int failed) { trace_read_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_read_msr); EXPORT_TRACEPOINT_SYMBOL(read_msr); -void do_trace_rdpmc(unsigned counter, u64 val, int failed) +void do_trace_rdpmc(u32 msr, u64 val, int failed) { - trace_rdpmc(counter, val, failed); + trace_rdpmc(msr, val, failed); } EXPORT_SYMBOL(do_trace_rdpmc); EXPORT_TRACEPOINT_SYMBOL(rdpmc); diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index a26c43abd47d..d78d769a02bd 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -40,6 +40,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) ALTERNATIVE_2 __stringify(RETPOLINE \reg), \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE, \ __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), ALT_NOT(X86_FEATURE_RETPOLINE) +SYM_PIC_ALIAS(__x86_indirect_thunk_\reg) .endm @@ -367,6 +368,54 @@ SYM_FUNC_END(call_depth_return_thunk) #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. + */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 29, 0xcc + +#define GEN(reg) ITS_THUNK reg +#include <asm/GEN-for-each-reg.h> +#undef GEN + + .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) +SYM_CODE_END(__x86_indirect_its_thunk_array) + +.align 64, 0xcc +.skip 32, 0xcc +SYM_CODE_START(its_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(its_return_thunk) +EXPORT_SYMBOL(its_return_thunk) + +#endif /* CONFIG_MITIGATION_ITS */ + /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. @@ -394,6 +443,7 @@ SYM_CODE_START(__x86_return_thunk) #endif int3 SYM_CODE_END(__x86_return_thunk) +SYM_PIC_ALIAS(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_MITIGATION_RETHUNK */ diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 53b3f202267c..f87ec24fa579 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -40,8 +40,7 @@ char *strncpy(char *dest, const char *src, size_t count) "stosb\n\t" "testb %%al,%%al\n\t" "jne 1b\n\t" - "rep\n\t" - "stosb\n" + "rep stosb\n" "2:" : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) : "0" (src), "1" (dest), "2" (count) : "memory"); @@ -54,8 +53,7 @@ EXPORT_SYMBOL(strncpy); char *strcat(char *dest, const char *src) { int d0, d1, d2, d3; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "decl %1\n" "1:\tlodsb\n\t" "stosb\n\t" @@ -72,8 +70,7 @@ EXPORT_SYMBOL(strcat); char *strncat(char *dest, const char *src, size_t count) { int d0, d1, d2, d3; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "decl %1\n\t" "movl %8,%3\n" "1:\tdecl %3\n\t" @@ -167,8 +164,7 @@ size_t strlen(const char *s) { int d0; size_t res; - asm volatile("repne\n\t" - "scasb" + asm volatile("repne scasb" : "=c" (res), "=&D" (d0) : "1" (s), "a" (0), "0" (0xffffffffu) : "memory"); @@ -184,8 +180,7 @@ void *memchr(const void *cs, int c, size_t count) void *res; if (!count) return NULL; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" @@ -202,7 +197,7 @@ void *memscan(void *addr, int c, size_t size) { if (!size) return addr; - asm volatile("repnz; scasb\n\t" + asm volatile("repnz scasb\n\t" "jnz 1f\n\t" "dec %%edi\n" "1:" diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index 38f37df056f7..28267985e85f 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c @@ -8,16 +8,14 @@ int d0, d1; register char *__res; __asm__ __volatile__( "movl %6,%%edi\n\t" - "repne\n\t" - "scasb\n\t" + "repne scasb\n\t" "notl %%ecx\n\t" "decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */ "movl %%ecx,%%edx\n" "1:\tmovl %6,%%edi\n\t" "movl %%esi,%%eax\n\t" "movl %%edx,%%ecx\n\t" - "repe\n\t" - "cmpsb\n\t" + "repe cmpsb\n\t" "je 2f\n\t" /* also works for empty string, see above */ "xchgl %%eax,%%esi\n\t" "incl %%esi\n\t" diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 422257c350c6..f6f436f1d573 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -38,9 +38,9 @@ do { \ might_fault(); \ __asm__ __volatile__( \ ASM_STAC "\n" \ - "0: rep; stosl\n" \ + "0: rep stosl\n" \ " movl %2,%0\n" \ - "1: rep; stosb\n" \ + "1: rep stosb\n" \ "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \ _ASM_EXTABLE_UA(1b, 2b) \ @@ -140,9 +140,9 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) " shrl $2, %0\n" " andl $3, %%eax\n" " cld\n" - "99: rep; movsl\n" + "99: rep movsl\n" "36: movl %%eax, %0\n" - "37: rep; movsb\n" + "37: rep movsb\n" "100:\n" _ASM_EXTABLE_UA(1b, 100b) _ASM_EXTABLE_UA(2b, 100b) @@ -242,9 +242,9 @@ static unsigned long __copy_user_intel_nocache(void *to, " shrl $2, %0\n" " andl $3, %%eax\n" " cld\n" - "6: rep; movsl\n" + "6: rep movsl\n" " movl %%eax,%0\n" - "7: rep; movsb\n" + "7: rep movsb\n" "8:\n" _ASM_EXTABLE_UA(0b, 8b) _ASM_EXTABLE_UA(1b, 8b) @@ -293,14 +293,14 @@ do { \ " negl %0\n" \ " andl $7,%0\n" \ " subl %0,%3\n" \ - "4: rep; movsb\n" \ + "4: rep movsb\n" \ " movl %3,%0\n" \ " shrl $2,%0\n" \ " andl $3,%3\n" \ " .align 2,0x90\n" \ - "0: rep; movsl\n" \ + "0: rep movsl\n" \ " movl %3,%0\n" \ - "1: rep; movsb\n" \ + "1: rep movsb\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \ diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index caedb3ef6688..262f7ca1fb95 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -35,7 +35,7 @@ # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. # -# REX2 Prefix +# REX2 Prefix Superscripts # - (!REX2): REX2 is not allowed # - (REX2): REX2 variant e.g. JMPABS @@ -147,7 +147,7 @@ AVXcode: # 0x60 - 0x6f 60: PUSHA/PUSHAD (i64) 61: POPA/POPAD (i64) -62: BOUND Gv,Ma (i64) | EVEX (Prefix) +62: BOUND Gv,Ma (i64) | EVEX (Prefix),(o64) 63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) 64: SEG=FS (Prefix) 65: SEG=GS (Prefix) @@ -253,8 +253,8 @@ c0: Grp2 Eb,Ib (1A) c1: Grp2 Ev,Ib (1A) c2: RETN Iw (f64) c3: RETN -c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) -c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) +c4: LES Gz,Mp (i64) | VEX+2byte (Prefix),(o64) +c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix),(o64) c6: Grp11A Eb,Ib (1A) c7: Grp11B Ev,Iz (1A) c8: ENTER Iw,Ib @@ -286,10 +286,10 @@ df: ESC # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix # in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. -e0: LOOPNE/LOOPNZ Jb (f64) (!REX2) -e1: LOOPE/LOOPZ Jb (f64) (!REX2) -e2: LOOP Jb (f64) (!REX2) -e3: JrCXZ Jb (f64) (!REX2) +e0: LOOPNE/LOOPNZ Jb (f64),(!REX2) +e1: LOOPE/LOOPZ Jb (f64),(!REX2) +e2: LOOP Jb (f64),(!REX2) +e3: JrCXZ Jb (f64),(!REX2) e4: IN AL,Ib (!REX2) e5: IN eAX,Ib (!REX2) e6: OUT Ib,AL (!REX2) @@ -298,10 +298,10 @@ e7: OUT Ib,eAX (!REX2) # in "near" jumps and calls is 16-bit. For CALL, # push of return address is 16-bit wide, RSP is decremented by 2 # but is not truncated to 16 bits, unlike RIP. -e8: CALL Jz (f64) (!REX2) -e9: JMP-near Jz (f64) (!REX2) -ea: JMP-far Ap (i64) (!REX2) -eb: JMP-short Jb (f64) (!REX2) +e8: CALL Jz (f64),(!REX2) +e9: JMP-near Jz (f64),(!REX2) +ea: JMP-far Ap (i64),(!REX2) +eb: JMP-short Jb (f64),(!REX2) ec: IN AL,DX (!REX2) ed: IN eAX,DX (!REX2) ee: OUT DX,AL (!REX2) @@ -478,22 +478,22 @@ AVXcode: 1 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). -80: JO Jz (f64) (!REX2) -81: JNO Jz (f64) (!REX2) -82: JB/JC/JNAE Jz (f64) (!REX2) -83: JAE/JNB/JNC Jz (f64) (!REX2) -84: JE/JZ Jz (f64) (!REX2) -85: JNE/JNZ Jz (f64) (!REX2) -86: JBE/JNA Jz (f64) (!REX2) -87: JA/JNBE Jz (f64) (!REX2) -88: JS Jz (f64) (!REX2) -89: JNS Jz (f64) (!REX2) -8a: JP/JPE Jz (f64) (!REX2) -8b: JNP/JPO Jz (f64) (!REX2) -8c: JL/JNGE Jz (f64) (!REX2) -8d: JNL/JGE Jz (f64) (!REX2) -8e: JLE/JNG Jz (f64) (!REX2) -8f: JNLE/JG Jz (f64) (!REX2) +80: JO Jz (f64),(!REX2) +81: JNO Jz (f64),(!REX2) +82: JB/JC/JNAE Jz (f64),(!REX2) +83: JAE/JNB/JNC Jz (f64),(!REX2) +84: JE/JZ Jz (f64),(!REX2) +85: JNE/JNZ Jz (f64),(!REX2) +86: JBE/JNA Jz (f64),(!REX2) +87: JA/JNBE Jz (f64),(!REX2) +88: JS Jz (f64),(!REX2) +89: JNS Jz (f64),(!REX2) +8a: JP/JPE Jz (f64),(!REX2) +8b: JNP/JPO Jz (f64),(!REX2) +8c: JL/JNGE Jz (f64),(!REX2) +8d: JNL/JGE Jz (f64),(!REX2) +8e: JLE/JNG Jz (f64),(!REX2) +8f: JNLE/JG Jz (f64),(!REX2) # 0x0f 0x90-0x9f 90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) 91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) @@ -996,8 +996,8 @@ AVXcode: 4 83: Grp1 Ev,Ib (1A),(es) # CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, # CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ -84: CTESTSCC (ev) -85: CTESTSCC (es) | CTESTSCC (66),(es) +84: CTESTSCC Eb,Gb (ev) +85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es) 88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) 8f: POP2 Bq,Rq (000),(11B),(ev) a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index d62662bdd460..5f253ae406b6 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft) void finit(void) { - fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); + fpstate_init_soft(&x86_task_fpu(current)->fpstate->regs.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 91c52ead1226..5034df617740 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -641,7 +641,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; + struct swregs_state *s387 = &x86_task_fpu(target)->fpstate->regs.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -692,7 +692,7 @@ int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; + struct swregs_state *s387 = &x86_task_fpu(target)->fpstate->regs.soft; const void *space = s387->st_space; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index eec3e4805c75..5e238e930fe3 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d) return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; } -#define I387 (¤t->thread.fpu.fpstate->regs) +#define I387 (&x86_task_fpu(current)->fpstate->regs) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 32035d5be5a0..5b9908f13dcf 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -3,12 +3,10 @@ KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_mem_encrypt.o := n KCOV_INSTRUMENT_mem_encrypt_amd.o := n -KCOV_INSTRUMENT_mem_encrypt_identity.o := n KCOV_INSTRUMENT_pgprot.o := n KASAN_SANITIZE_mem_encrypt.o := n KASAN_SANITIZE_mem_encrypt_amd.o := n -KASAN_SANITIZE_mem_encrypt_identity.o := n KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions @@ -16,12 +14,10 @@ KASAN_SANITIZE_pgprot.o := n KCSAN_SANITIZE := n # Avoid recursion by not calling KMSAN hooks for CEA code. KMSAN_SANITIZE_cpu_entry_area.o := n -KMSAN_SANITIZE_mem_encrypt_identity.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg CFLAGS_REMOVE_mem_encrypt_amd.o = -pg -CFLAGS_REMOVE_mem_encrypt_identity.o = -pg CFLAGS_REMOVE_pgprot.o = -pg endif @@ -32,9 +28,6 @@ obj-y += pat/ # Make sure __phys_addr has no stackprotector CFLAGS_physaddr.o := -fno-stack-protector -CFLAGS_mem_encrypt_identity.o := -fno-stack-protector - -CFLAGS_fault.o := -I $(src)/../include/asm/trace obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o @@ -52,7 +45,7 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o -obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o @@ -63,5 +56,4 @@ obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 628833afee37..f980b0eb0105 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -25,7 +25,7 @@ #include <asm/numa.h> #include <asm/mpspec.h> #include <asm/apic.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> static unsigned char __initdata nodeids[8]; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 51986e8a9d35..bf8dab18be97 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -111,7 +111,7 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup, /* * Handler for when we fail to restore a task's FPU state. We should never get - * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * here because the FPU state of a task using the FPU (struct fpu::fpstate) * should always be valid. However, past bugs have allowed userspace to set * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). * These caused XRSTOR to fail when switching to the task, leaking the FPU diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 296d294142c8..998bd807fc7b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -13,7 +13,6 @@ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ -#include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ @@ -38,7 +37,7 @@ #include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS -#include <asm/trace/exceptions.h> +#include <trace/events/exceptions.h> /* * Returns 0 if mmiotrace is disabled, or if the fault is not @@ -1455,9 +1454,6 @@ static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - if (!trace_pagefault_enabled()) - return; - if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else @@ -1496,8 +1492,6 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); - prefetchw(¤t->mm->mmap_lock); - /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bfa444a7dbb0..7456df985d96 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -28,6 +28,7 @@ #include <asm/text-patching.h> #include <asm/memtype.h> #include <asm/paravirt.h> +#include <asm/mmu_context.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -173,11 +174,7 @@ __ref void *alloc_low_pages(unsigned int num) * randomization is enabled. */ -#ifndef CONFIG_X86_5LEVEL -#define INIT_PGD_PAGE_TABLES 3 -#else #define INIT_PGD_PAGE_TABLES 4 -#endif #ifndef CONFIG_RANDOMIZE_MEMORY #define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) @@ -824,31 +821,33 @@ void __init poking_init(void) spinlock_t *ptl; pte_t *ptep; - poking_mm = mm_alloc(); - BUG_ON(!poking_mm); + text_poke_mm = mm_alloc(); + BUG_ON(!text_poke_mm); /* Xen PV guests need the PGD to be pinned. */ - paravirt_enter_mmap(poking_mm); + paravirt_enter_mmap(text_poke_mm); + + set_notrack_mm(text_poke_mm); /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, * and adjust the address if the PMD ends after the first one. */ - poking_addr = TASK_UNMAPPED_BASE; + text_poke_mm_addr = TASK_UNMAPPED_BASE; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); - if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) - poking_addr += PAGE_SIZE; + if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + text_poke_mm_addr += PAGE_SIZE; /* * We need to trigger the allocation of the page-tables that will be * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad662cc4605c..607d6a2e66e2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,6 +30,7 @@ #include <linux/initrd.h> #include <linux/cpumask.h> #include <linux/gfp.h> +#include <linux/execmem.h> #include <asm/asm.h> #include <asm/bios_ebda.h> @@ -565,7 +566,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n" + "Warning: only 4GB will be used. Support for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: @@ -612,7 +613,6 @@ void __init find_low_pfn_range(void) highmem_pfn_init(); } -#ifndef CONFIG_NUMA void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM @@ -633,12 +633,6 @@ void __init initmem_init(void) printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); - setup_bootmem_allocator(); -} -#endif /* !CONFIG_NUMA */ - -void __init setup_bootmem_allocator(void) -{ printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); @@ -755,6 +749,8 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c4f6f591f2b..66330fe4e18c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,6 +34,7 @@ #include <linux/gfp.h> #include <linux/kcore.h> #include <linux/bootmem_info.h> +#include <linux/execmem.h> #include <asm/processor.h> #include <asm/bios_ebda.h> @@ -805,12 +806,17 @@ kernel_physical_mapping_change(unsigned long paddr_start, } #ifndef CONFIG_NUMA -void __init initmem_init(void) +static inline void x86_numa_init(void) { memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); } #endif +void __init initmem_init(void) +{ + x86_numa_init(); +} + void __init paging_init(void) { sparse_init(); @@ -827,7 +833,6 @@ void __init paging_init(void) zone_sizes_init(); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP #define PAGE_UNUSED 0xFD /* @@ -926,7 +931,6 @@ static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long if (!IS_ALIGNED(end, PMD_SIZE)) unused_pmd_start = end; } -#endif /* * Memory hotplug specific functions @@ -1146,16 +1150,13 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); pages++; - } -#ifdef CONFIG_SPARSEMEM_VMEMMAP - else if (vmemmap_pmd_is_unused(addr, next)) { + } else if (vmemmap_pmd_is_unused(addr, next)) { free_hugepage_table(pmd_page(*pmd), altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); } -#endif continue; } @@ -1391,6 +1392,8 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + execmem_cache_make_ro(); + kernel_set_to_readonly = 1; /* @@ -1492,7 +1495,6 @@ unsigned long memory_block_size_bytes(void) return memory_block_size_probed; } -#ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ @@ -1639,4 +1641,3 @@ void __meminit vmemmap_populate_print_last(void) node_start = 0; } } -#endif diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 7490ff6d83b1..faf3a13fb6ba 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -40,7 +40,9 @@ * section is later cleared. */ u64 sme_me_mask __section(".data") = 0; +SYM_PIC_ALIAS(sme_me_mask); u64 sev_status __section(".data") = 0; +SYM_PIC_ALIAS(sev_status); u64 sev_check_data __section(".data") = 0; EXPORT_SYMBOL(sme_me_mask); diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 3f37b5c80bb3..097aadc250f7 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -25,4 +25,8 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); extern unsigned long tlb_single_page_flush_ceiling; +#ifdef CONFIG_NUMA +void __init x86_numa_init(void); +#endif + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 64e5cdb2460a..c24890c40138 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -18,9 +18,10 @@ #include <asm/e820/api.h> #include <asm/proto.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/numa.h> +#include <asm/amd/nb.h> -#include "numa_internal.h" +#include "mm_internal.h" int numa_off; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c deleted file mode 100644 index 65fda406e6f2..000000000000 --- a/arch/x86/mm/numa_32.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation - * August 2002: added remote node KVA remap - Martin J. Bligh - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/memblock.h> -#include <linux/init.h> -#include <linux/vmalloc.h> -#include <asm/pgtable_areas.h> - -#include "numa_internal.h" - -extern unsigned long highend_pfn, highstart_pfn; - -void __init initmem_init(void) -{ - x86_numa_init(); - -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) - highstart_pfn = max_low_pfn; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", - max_low_pfn, highstart_pfn); - - printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", - (ulong) pfn_to_kaddr(max_low_pfn)); - - printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", - (ulong) pfn_to_kaddr(highstart_pfn)); - - __vmalloc_start_set = true; - setup_bootmem_allocator(); -} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c deleted file mode 100644 index 59d80160fa5a..000000000000 --- a/arch/x86/mm/numa_64.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Generic VM initialization for x86-64 NUMA setups. - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include <linux/memblock.h> - -#include "numa_internal.h" - -void __init initmem_init(void) -{ - x86_numa_init(); -} diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h deleted file mode 100644 index 11e1ff370c10..000000000000 --- a/arch/x86/mm/numa_internal.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __X86_MM_NUMA_INTERNAL_H -#define __X86_MM_NUMA_INTERNAL_H - -#include <linux/types.h> -#include <asm/numa.h> - -void __init x86_numa_init(void); - -#endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 72d8cbc61158..c97b527c66fe 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -38,6 +38,7 @@ #include <linux/kernel.h> #include <linux/pfn_t.h> #include <linux/slab.h> +#include <linux/io.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/fs.h> @@ -232,7 +233,7 @@ void pat_cpu_init(void) panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } - wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); + wrmsrq(MSR_IA32_CR_PAT, pat_msr_val); __flush_tlb_all(); } @@ -256,7 +257,7 @@ void __init pat_bp_init(void) if (!cpu_feature_enabled(X86_FEATURE_PAT)) pat_disable("PAT not supported by the CPU."); else - rdmsrl(MSR_IA32_CR_PAT, pat_msr_val); + rdmsrq(MSR_IA32_CR_PAT, pat_msr_val); if (!pat_msr_val) { pat_disable("PAT support disabled by the firmware."); @@ -682,6 +683,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr) /** * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type * of @pfn cannot be overridden by UC MTRR memory type. + * @pfn: The page frame number to check. * * Only to be called when PAT is enabled. * @@ -773,38 +775,14 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, return vma_prot; } -#ifdef CONFIG_STRICT_DEVMEM -/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - return 1; -} -#else -/* This check is needed to avoid cache aliasing when PAT is enabled */ -static inline int range_is_allowed(unsigned long pfn, unsigned long size) -{ - u64 from = ((u64)pfn) << PAGE_SHIFT; - u64 to = from + size; - u64 cursor = from; - - if (!pat_enabled()) - return 1; - - while (cursor < to) { - if (!devmem_is_allowed(pfn)) - return 0; - cursor += PAGE_SIZE; - pfn++; - } - return 1; -} -#endif /* CONFIG_STRICT_DEVMEM */ - int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; + if (!pat_enabled()) + return 1; + if (!range_is_allowed(pfn, size)) return 0; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index def3d9284254..30ab4aced761 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -889,7 +889,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 - if (!SHARED_KERNEL_PMD) { + { struct page *page; list_for_each_entry(page, &pgd_list, lru) { @@ -1293,7 +1293,7 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, /* Queue the page table to be freed after TLB flush */ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); - if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { + if (IS_ENABLED(CONFIG_X86_32)) { struct page *page; /* Update all PGD tables to use the same large page */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a05fcddfc811..62777ba4de1a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -10,6 +10,7 @@ #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); +SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) @@ -68,12 +69,6 @@ static inline void pgd_list_del(pgd_t *pgd) list_del(&ptdesc->pt_list); } -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -#define MAX_UNSHARED_PTRS_PER_PGD \ - MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) - - static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; @@ -86,29 +81,19 @@ struct mm_struct *pgd_page_get_mm(struct page *page) static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (CONFIG_PGTABLE_LEVELS == 2 || - (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - CONFIG_PGTABLE_LEVELS >= 4) { + /* PAE preallocates all its PMDs. No cloning needed. */ + if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - } - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) { - pgd_set_mm(pgd, mm); - pgd_list_add(pgd); - } + /* List used to sync kernel mapping updates */ + pgd_set_mm(pgd, mm); + pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { - if (SHARED_KERNEL_PMD) - return; - spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); @@ -132,15 +117,15 @@ static void pgd_dtor(pgd_t *pgd) * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. */ -#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD -#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD +#define PREALLOCATED_PMDS PTRS_PER_PGD /* + * "USER_PMDS" are the PMDs for the user copy of the page tables when + * PTI is enabled. They do not exist when PTI is disabled. Note that + * this is distinct from the user _portion_ of the kernel page tables + * which always exists. + * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. @@ -169,7 +154,6 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 -#define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ @@ -318,82 +302,28 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, { } #endif -/* - * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also - * assumes that pgd should be in one page. - * - * But kernel with PAE paging that is not running as a Xen domain - * only needs to allocate 32 bytes for pgd instead of one page. - */ -#ifdef CONFIG_X86_PAE - -#include <linux/slab.h> - -#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) -#define PGD_ALIGN 32 - -static struct kmem_cache *pgd_cache; - -void __init pgtable_cache_init(void) -{ - /* - * When PAE kernel is running as a Xen domain, it does not use - * shared kernel pmd. And this requires a whole page for pgd. - */ - if (!SHARED_KERNEL_PMD) - return; - - /* - * when PAE kernel is not running as a Xen domain, it uses - * shared kernel pmd. Shared kernel pmd does not require a whole - * page for pgd. We are able to just allocate a 32-byte for pgd. - * During boot time, we create a 32-byte slab for pgd table allocation. - */ - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, - SLAB_PANIC, NULL); -} static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* - * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. - * We allocate one page for pgd. - */ - if (!SHARED_KERNEL_PMD) - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); - - /* - * Now PAE kernel is not running as a Xen domain. We can allocate - * a 32-byte slab for pgd to save memory space. + * PTI and Xen need a whole page for the PAE PGD + * even though the hardware only needs 32 bytes. + * + * For simplicity, allocate a page for all users. */ - return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); -} - -static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - if (!SHARED_KERNEL_PMD) - __pgd_free(mm, pgd); - else - kmem_cache_free(pgd_cache, pgd); -} -#else - -static inline pgd_t *_pgd_alloc(struct mm_struct *mm) -{ - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } -#endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; - pmd_t *pmds[MAX_PREALLOCATED_PMDS]; + pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); @@ -613,11 +543,11 @@ pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, #endif /** - * reserve_top_address - reserves a hole in the top of kernel address space - * @reserve - size of hole to reserve + * reserve_top_address - Reserve a hole in the top of the kernel address space + * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top - * of kernel address space to make room for a hypervisor. + * of the kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { @@ -662,9 +592,12 @@ void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -#ifdef CONFIG_X86_5LEVEL +#if CONFIG_PGTABLE_LEVELS > 4 /** - * p4d_set_huge - setup kernel P4D mapping + * p4d_set_huge - Set up kernel P4D mapping + * @p4d: Pointer to the P4D entry + * @addr: Virtual address associated with the P4D entry + * @prot: Protection bits to use * * No 512GB pages yet -- always return 0 */ @@ -674,9 +607,10 @@ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) } /** - * p4d_clear_huge - clear kernel P4D mapping when it is set + * p4d_clear_huge - Clear kernel P4D mapping when it is set + * @p4d: Pointer to the P4D entry to clear * - * No 512GB pages yet -- always return 0 + * No 512GB pages yet -- do nothing */ void p4d_clear_huge(p4d_t *p4d) { @@ -684,7 +618,10 @@ void p4d_clear_huge(p4d_t *p4d) #endif /** - * pud_set_huge - setup kernel PUD mapping + * pud_set_huge - Set up kernel PUD mapping + * @pud: Pointer to the PUD entry + * @addr: Virtual address associated with the PUD entry + * @prot: Protection bits to use * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR @@ -715,7 +652,10 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) } /** - * pmd_set_huge - setup kernel PMD mapping + * pmd_set_huge - Set up kernel PMD mapping + * @pmd: Pointer to the PMD entry + * @addr: Virtual address associated with the PMD entry + * @prot: Protection bits to use * * See text over pud_set_huge() above. * @@ -744,7 +684,8 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) } /** - * pud_clear_huge - clear kernel PUD mapping when it is set + * pud_clear_huge - Clear kernel PUD mapping when it is set + * @pud: Pointer to the PUD entry to clear. * * Returns 1 on success and 0 on failure (no PUD map is found). */ @@ -759,7 +700,8 @@ int pud_clear_huge(pud_t *pud) } /** - * pmd_clear_huge - clear kernel PMD mapping when it is set + * pmd_clear_huge - Clear kernel PMD mapping when it is set + * @pmd: Pointer to the PMD entry to clear. * * Returns 1 on success and 0 on failure (no PMD map is found). */ @@ -775,11 +717,11 @@ int pmd_clear_huge(pmd_t *pmd) #ifdef CONFIG_X86_64 /** - * pud_free_pmd_page - Clear pud entry and free pmd page. - * @pud: Pointer to a PUD. - * @addr: Virtual address associated with pud. + * pud_free_pmd_page - Clear PUD entry and free PMD page + * @pud: Pointer to a PUD + * @addr: Virtual address associated with PUD * - * Context: The pud range has been unmapped and TLB purged. + * Context: The PUD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. @@ -822,11 +764,11 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) } /** - * pmd_free_pte_page - Clear pmd entry and free pte page. - * @pmd: Pointer to a PMD. - * @addr: Virtual address associated with pmd. + * pmd_free_pte_page - Clear PMD entry and free PTE page. + * @pmd: Pointer to the PMD + * @addr: Virtual address associated with PMD * - * Context: The pmd range has been unmapped and TLB purged. + * Context: The PMD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) @@ -848,7 +790,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) /* * Disable free page handling on x86-PAE. This assures that ioremap() - * does not update sync'd pmd entries. See vmalloc_sync_one(). + * does not update sync'd PMD entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 5f0d579932c6..190299834011 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -185,7 +185,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } - BUILD_BUG_ON(pgd_leaf(*pgd) != 0); + BUILD_BUG_ON(pgd_leaf(*pgd)); return p4d_offset(pgd, address); } @@ -206,7 +206,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!p4d) return NULL; - BUILD_BUG_ON(p4d_leaf(*p4d) != 0); + BUILD_BUG_ON(p4d_leaf(*p4d)); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e459d97ef397..39f80111e6f1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -19,6 +19,7 @@ #include <asm/cache.h> #include <asm/cacheflush.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlb.h> @@ -215,16 +216,20 @@ static void clear_asid_other(void) atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +struct new_asid { + unsigned int asid : 16; + unsigned int need_flush : 1; +}; -static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, - u16 *new_asid, bool *need_flush) +static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen) { + struct new_asid ns; u16 asid; if (!static_cpu_has(X86_FEATURE_PCID)) { - *new_asid = 0; - *need_flush = true; - return; + ns.asid = 0; + ns.need_flush = 1; + return ns; } /* @@ -235,9 +240,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 global_asid = mm_global_asid(next); if (global_asid) { - *new_asid = global_asid; - *need_flush = false; - return; + ns.asid = global_asid; + ns.need_flush = 0; + return ns; } } @@ -249,22 +254,23 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, next->context.ctx_id) continue; - *new_asid = asid; - *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < - next_tlb_gen); - return; + ns.asid = asid; + ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen); + return ns; } /* * We don't currently own an ASID slot on this CPU. * Allocate a slot. */ - *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; - if (*new_asid >= TLB_NR_DYN_ASIDS) { - *new_asid = 0; + ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (ns.asid >= TLB_NR_DYN_ASIDS) { + ns.asid = 0; this_cpu_write(cpu_tlbstate.next_asid, 1); } - *need_flush = true; + ns.need_flush = true; + + return ns; } /* @@ -623,7 +629,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, { /* Flush L1D if the outgoing task requests it */ if (prev_mm & LAST_USER_MM_L1D_FLUSH) - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); /* Check whether the incoming task opted in for L1D flush */ if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) @@ -667,9 +673,9 @@ static void cond_mitigation(struct task_struct *next) prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); /* - * Avoid user/user BTB poisoning by flushing the branch predictor - * when switching between processes. This stops one process from - * doing Spectre-v2 attacks on another. + * Avoid user->user BTB/RSB poisoning by flushing them when switching + * between processes. This stops one process from doing Spectre-v2 + * attacks on another. * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the @@ -781,9 +787,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); unsigned long new_lam; + struct new_asid ns; u64 next_tlb_gen; - bool need_flush; - u16 new_asid; + /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) @@ -847,14 +853,15 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && + if (IS_ENABLED(CONFIG_DEBUG_VM) && + WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); /* Check if the current mm is transitioning to a global ASID */ if (mm_needs_global_asid(next, prev_asid)) { next_tlb_gen = atomic64_read(&next->context.tlb_gen); - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + ns = choose_new_asid(next, next_tlb_gen); goto reload_tlb; } @@ -889,8 +896,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * TLB contents went out of date while we were in lazy * mode. Fall through to the TLB switching code below. */ - new_asid = prev_asid; - need_flush = true; + ns.asid = prev_asid; + ns.need_flush = true; } else { /* * Apply process to process speculation vulnerability @@ -899,40 +906,33 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Let nmi_uaccess_okay() and finish_asid_transition() - * know that CR3 is changing. + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); - /* - * Leave this CPU in prev's mm_cpumask. Atomic writes to - * mm_cpumask can be expensive under contention. The CPU - * will be removed lazily at TLB flush time. - */ - VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, - mm_cpumask(prev))); - /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + ns = choose_new_asid(next, next_tlb_gen); } reload_tlb: new_lam = mm_lam_cr3_mask(next); - if (need_flush) { - VM_WARN_ON_ONCE(is_global_asid(new_asid)); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, new_lam, true); + if (ns.need_flush) { + VM_WARN_ON_ONCE(is_global_asid(ns.asid)); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, true); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, new_lam, false); + load_new_mm_cr3(next->pgd, ns.asid, new_lam, false); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -941,7 +941,7 @@ reload_tlb: barrier(); this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid); cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); if (next != prev) { @@ -972,6 +972,77 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) } /* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. Temporary mms can also be used for EFI runtime service + * calls or similar functionality. + * + * It is illegal to schedule while using a temporary mm -- the context switch + * code is unaware of the temporary mm and does not know how to context switch. + * Use a real (non-temporary) mm in a kernel thread if you need to sleep. + * + * Note: For sensitive memory writes, the temporary mm needs to be used + * exclusively by a single core, and IRQs should be disabled while the + * temporary mm is loaded, thereby preventing interrupt handler bugs from + * overriding the kernel memory protection. + */ +struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *prev_mm; + + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(); + + prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, temp_mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return prev_mm; +} + +void unuse_temporary_mm(struct mm_struct *prev_mm) +{ + lockdep_assert_preemption_disabled(); + guard(irqsave)(); + + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(smp_processor_id(), mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm))); + + switch_mm_irqs_off(NULL, prev_mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + +/* * Call this when reinitializing a CPU. It fixes the following potential * problems: * @@ -1204,8 +1275,16 @@ done: static bool should_flush_tlb(int cpu, void *data) { + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); struct flush_tlb_info *info = data; + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. + */ + smp_rmb(); + /* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) return false; @@ -1214,8 +1293,15 @@ static bool should_flush_tlb(int cpu, void *data) if (!info->mm) return true; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; + /* The target mm is loaded, and the CPU is not lazy. */ - if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + if (loaded_mm == info->mm) return true; /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9e5fe2ba858f..15672cb926fc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -41,6 +41,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT5(b1, b2, b3, b4, b5) \ + do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0) #define EMIT1_off32(b1, off) \ do { EMIT1(b1); EMIT(off, 4); } while (0) @@ -629,7 +631,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, goto out; ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { - text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + smp_text_poke_single(ip, new_insn, X86_PATCH_SIZE, NULL); ret = 0; } out: @@ -661,7 +663,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { + OPTIMIZER_HIDE_VAR(reg); + emit_jump(&prog, its_static_thunk(reg), ip); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { @@ -683,7 +688,7 @@ static void emit_return(u8 **pprog, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk()) { emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ @@ -1502,6 +1507,48 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr) #define PRIV_STACK_GUARD_SZ 8 #define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL +static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, + struct bpf_prog *bpf_prog) +{ + u8 *prog = *pprog; + u8 *func; + + if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) { + /* The clearing sequence clobbers eax and ecx. */ + EMIT1(0x50); /* push rax */ + EMIT1(0x51); /* push rcx */ + ip += 2; + + func = (u8 *)clear_bhb_loop; + ip += x86_call_depth_emit_accounting(&prog, func, ip); + + if (emit_call(&prog, func, ip)) + return -EINVAL; + EMIT1(0x59); /* pop rcx */ + EMIT1(0x58); /* pop rax */ + } + /* Insert IBHF instruction */ + if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) && + cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) || + cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) { + /* + * Add an Indirect Branch History Fence (IBHF). IBHF acts as a + * fence preventing branch history from before the fence from + * affecting indirect branches after the fence. This is + * specifically used in cBPF jitted code to prevent Intra-mode + * BHI attacks. The IBHF instruction is designed to be a NOP on + * hardware that doesn't need or support it. The REP and REX.W + * prefixes are required by the microcode, and they also ensure + * that the NOP is unlikely to be used in existing code. + * + * IBHF is not a valid instruction in 32-bit mode. + */ + EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */ + } + *pprog = prog; + return 0; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -2544,6 +2591,13 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; + if (bpf_prog_was_classic(bpf_prog) && + !capable(CAP_SYS_ADMIN)) { + u8 *ip = image + addrs[i - 1]; + + if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) + return -EINVAL; + } if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 631512f7ec85..99b1727136c1 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -5,7 +5,7 @@ #include <linux/cpu.h> #include <linux/range.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/pci_x86.h> #include <asm/pci-direct.h> @@ -202,7 +202,7 @@ static int __init early_root_info_init(void) /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; - rdmsrl(address, val); + rdmsrq(address, val); end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20); if (end < (1ULL<<32)) @@ -293,12 +293,12 @@ static int __init early_root_info_init(void) /* need to take out [4G, TOM2) for RAM*/ /* SYS_CFG */ address = MSR_AMD64_SYSCFG; - rdmsrl(address, val); + rdmsrq(address, val); /* TOP_MEM2 is enabled? */ if (val & (1<<21)) { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; - rdmsrl(address, val); + rdmsrq(address, val); end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20); subtract_range(range, RANGE_NUM, 1ULL<<32, end); @@ -341,10 +341,10 @@ static int amd_bus_cpu_online(unsigned int cpu) { u64 reg; - rdmsrl(MSR_AMD64_NB_CFG, reg); + rdmsrq(MSR_AMD64_NB_CFG, reg); if (!(reg & ENABLE_CF8_EXT_CFG)) { reg |= ENABLE_CF8_EXT_CFG; - wrmsrl(MSR_AMD64_NB_CFG, reg); + wrmsrq(MSR_AMD64_NB_CFG, reg); } return 0; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index efefeb82ab61..36336299596b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -9,7 +9,7 @@ #include <linux/pci.h> #include <linux/suspend.h> #include <linux/vgaarb.h> -#include <asm/amd_node.h> +#include <asm/amd/node.h> #include <asm/hpet.h> #include <asm/pci_x86.h> diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 39255f0eb14d..1f4522325920 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -22,9 +22,10 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/rculist.h> +#include <asm/acpi.h> #include <asm/e820/api.h> +#include <asm/msr.h> #include <asm/pci_x86.h> -#include <asm/acpi.h> /* Indicate if the ECAM resources have been placed into the resource table */ static bool pci_mmcfg_running_state; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac57259a432b..e7e8f77f77f8 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -73,7 +73,7 @@ int __init efi_alloc_page_tables(void) gfp_t gfp_mask; gfp_mask = GFP_KERNEL | __GFP_ZERO; - efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, pgd_allocation_order()); if (!efi_pgd) goto fail; @@ -89,6 +89,7 @@ int __init efi_alloc_page_tables(void) efi_mm.pgd = efi_pgd; mm_init_cpumask(&efi_mm); init_new_context(NULL, &efi_mm); + set_notrack_mm(&efi_mm); return 0; @@ -96,7 +97,7 @@ free_p4d: if (pgtable_l5_enabled()) free_page((unsigned long)pgd_page_vaddr(*pgd)); free_pgd: - free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); + free_pages((unsigned long)efi_pgd, pgd_allocation_order()); fail: return -ENOMEM; } @@ -434,15 +435,12 @@ void __init efi_dump_pagetable(void) */ static void efi_enter_mm(void) { - efi_prev_mm = current->active_mm; - current->active_mm = &efi_mm; - switch_mm(efi_prev_mm, &efi_mm, NULL); + efi_prev_mm = use_temporary_mm(&efi_mm); } static void efi_leave_mm(void) { - current->active_mm = efi_prev_mm; - switch_mm(&efi_mm, efi_prev_mm, NULL); + unuse_temporary_mm(efi_prev_mm); } void arch_efi_call_virt_setup(void) diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c index 57f210cda761..ee77d57bcab7 100644 --- a/arch/x86/platform/olpc/olpc-xo1-rtc.c +++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c @@ -64,9 +64,9 @@ static int __init xo1_rtc_init(void) of_node_put(node); pr_info("olpc-xo1-rtc: Initializing OLPC XO-1 RTC\n"); - rdmsrl(MSR_RTC_DOMA_OFFSET, rtc_info.rtc_day_alarm); - rdmsrl(MSR_RTC_MONA_OFFSET, rtc_info.rtc_mon_alarm); - rdmsrl(MSR_RTC_CEN_OFFSET, rtc_info.rtc_century); + rdmsrq(MSR_RTC_DOMA_OFFSET, rtc_info.rtc_day_alarm); + rdmsrq(MSR_RTC_MONA_OFFSET, rtc_info.rtc_mon_alarm); + rdmsrq(MSR_RTC_CEN_OFFSET, rtc_info.rtc_century); r = platform_device_register(&xo1_rtc_device); if (r) diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 63066e7c8517..30751b42d54e 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -325,7 +325,7 @@ static int setup_sci_interrupt(struct platform_device *pdev) dev_info(&pdev->dev, "SCI unmapped. Mapping to IRQ 3\n"); sci_irq = 3; lo |= 0x00300000; - wrmsrl(0x51400020, lo); + wrmsrq(0x51400020, lo); } /* Select level triggered in PIC */ diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index cfa18ec7d55f..1d78e5631bb8 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -87,8 +87,7 @@ SYM_CODE_START(pvh_start_xen) mov %ebx, %esi movl rva(pvh_start_info_sz)(%ebp), %ecx shr $2,%ecx - rep - movsl + rep movsl leal rva(early_stack_end)(%ebp), %esp diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 08e76a5ca155..916441f5e85c 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -27,6 +27,7 @@ #include <asm/mmu_context.h> #include <asm/cpu_device_id.h> #include <asm/microcode.h> +#include <asm/msr.h> #include <asm/fred.h> #ifdef CONFIG_X86_32 @@ -44,7 +45,7 @@ static void msr_save_context(struct saved_context *ctxt) while (msr < end) { if (msr->valid) - rdmsrl(msr->info.msr_no, msr->info.reg.q); + rdmsrq(msr->info.msr_no, msr->info.reg.q); msr++; } } @@ -56,7 +57,7 @@ static void msr_restore_context(struct saved_context *ctxt) while (msr < end) { if (msr->valid) - wrmsrl(msr->info.msr_no, msr->info.reg.q); + wrmsrq(msr->info.msr_no, msr->info.reg.q); msr++; } } @@ -110,12 +111,12 @@ static void __save_processor_state(struct saved_context *ctxt) savesegment(ds, ctxt->ds); savesegment(es, ctxt->es); - rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); + rdmsrq(MSR_FS_BASE, ctxt->fs_base); + rdmsrq(MSR_GS_BASE, ctxt->kernelmode_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); - rdmsrl(MSR_EFER, ctxt->efer); + rdmsrq(MSR_EFER, ctxt->efer); #endif /* @@ -125,7 +126,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr2 = read_cr2(); ctxt->cr3 = __read_cr3(); ctxt->cr4 = __read_cr4(); - ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, + ctxt->misc_enable_saved = !rdmsrq_safe(MSR_IA32_MISC_ENABLE, &ctxt->misc_enable); msr_save_context(ctxt); } @@ -198,7 +199,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) struct cpuinfo_x86 *c; if (ctxt->misc_enable_saved) - wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); + wrmsrq(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); /* * control registers */ @@ -208,7 +209,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) __write_cr4(ctxt->cr4); #else /* CONFIG X86_64 */ - wrmsrl(MSR_EFER, ctxt->efer); + wrmsrq(MSR_EFER, ctxt->efer); __write_cr4(ctxt->cr4); #endif write_cr3(ctxt->cr3); @@ -231,7 +232,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) * handlers or in complicated helpers like load_gs_index(). */ #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); + wrmsrq(MSR_GS_BASE, ctxt->kernelmode_gs_base); /* * Reinitialize FRED to ensure the FRED MSRs contain the same values @@ -267,8 +268,8 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) * restoring the selectors clobbers the bases. Keep in mind * that MSR_KERNEL_GS_BASE is horribly misnamed. */ - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); + wrmsrq(MSR_FS_BASE, ctxt->fs_base); + wrmsrq(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); #else loadsegment(gs, ctxt->gs); #endif @@ -414,7 +415,7 @@ static int msr_build_context(const u32 *msr_id, const int num) u64 dummy; msr_array[i].info.msr_no = msr_id[j]; - msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy); + msr_array[i].valid = !rdmsrq_safe(msr_id[j], &dummy); msr_array[i].info.reg.q = 0; } saved_msrs->num = total_num; diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 5b81d19cd114..a7c23f2a58c9 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -42,6 +42,7 @@ unsigned long relocated_restore_code __visible; /** * pfn_is_nosave - check if given pfn is in the 'nosave' section + * @pfn: the page frame number to check. */ int pfn_is_nosave(unsigned long pfn) { @@ -86,7 +87,10 @@ static inline u32 compute_e820_crc32(struct e820_table *table) /** * arch_hibernation_header_save - populate the architecture specific part * of a hibernation image header - * @addr: address to save the data at + * @addr: address where architecture specific header data will be saved. + * @max_size: maximum size of architecture specific data in hibernation header. + * + * Return: 0 on success, -EOVERFLOW if max_size is insufficient. */ int arch_hibernation_header_save(void *addr, unsigned int max_size) { diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 5606a15cf9a1..fb910d9f8471 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -69,8 +69,7 @@ copy_loop: movl pbe_orig_address(%edx), %edi movl $(PAGE_SIZE >> 2), %ecx - rep - movsl + rep movsl movl pbe_next(%edx), %edx jmp copy_loop diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 8c534c36adfa..c73be0a02a6c 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -26,7 +26,7 @@ /* code below belongs to the image kernel */ .align PAGE_SIZE SYM_FUNC_START(restore_registers) - ANNOTATE_NOENDBR + ENDBR /* go back to the original page tables */ movq %r9, %cr3 @@ -120,7 +120,7 @@ SYM_FUNC_END(restore_image) /* code below has been relocated to a safe page */ SYM_FUNC_START(core_restore_code) - ANNOTATE_NOENDBR + ENDBR /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ @@ -138,8 +138,7 @@ SYM_FUNC_START(core_restore_code) movq pbe_address(%rdx), %rsi movq pbe_orig_address(%rdx), %rdi movq $(PAGE_SIZE >> 3), %rcx - rep - movsq + rep movsq /* progress to the next pbe */ movq pbe_next(%rdx), %rdx diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index f9bc444a3064..ed5c63c0b4e5 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -9,6 +9,7 @@ #include <asm/realmode.h> #include <asm/tlbflush.h> #include <asm/crash.h> +#include <asm/msr.h> #include <asm/sev.h> struct real_mode_header *real_mode_header; @@ -145,7 +146,7 @@ static void __init setup_real_mode(void) * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR * so we need to mask it out. */ - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); trampoline_header->efer = efer & ~EFER_LMA; trampoline_header->start = (u64) secondary_startup_64; diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 5770c8097f32..2c19d7fc8a85 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -64,6 +64,8 @@ BEGIN { modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" + invalid64_expr = "\\(i64\\)" + only64_expr = "\\(o64\\)" rex_expr = "^((REX(\\.[XRWB]+)+)|(REX$))" rex2_expr = "\\(REX2\\)" no_rex2_expr = "\\(!REX2\\)" @@ -319,6 +321,11 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, force64_expr)) flags = add_flags(flags, "INAT_FORCE64") + # check invalid in 64-bit (and no only64) + if (match(ext, invalid64_expr) && + !match($0, only64_expr)) + flags = add_flags(flags, "INAT_INV64") + # check REX2 not allowed if (match(ext, no_rex2_expr)) flags = add_flags(flags, "INAT_NO_REX2") diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index ab5c8e47049c..9193a7790a71 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -31,8 +31,8 @@ struct faultinfo { #define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movl $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index 26fb4835d3e9..61e4ca1e0ab5 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -31,8 +31,8 @@ struct faultinfo { #define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movq $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index fc473ca12c44..942372e69b4d 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -27,9 +27,10 @@ #include <asm/smp.h> #include <asm/cpu.h> #include <asm/apic.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/cmdline.h> #include <asm/iommu.h> +#include <asm/msr.h> /* * The RMP entry information as returned by the RMPREAD instruction. @@ -136,11 +137,11 @@ static int __mfd_enable(unsigned int cpu) if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; - rdmsrl(MSR_AMD64_SYSCFG, val); + rdmsrq(MSR_AMD64_SYSCFG, val); val |= MSR_AMD64_SYSCFG_MFDM; - wrmsrl(MSR_AMD64_SYSCFG, val); + wrmsrq(MSR_AMD64_SYSCFG, val); return 0; } @@ -157,12 +158,12 @@ static int __snp_enable(unsigned int cpu) if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; - rdmsrl(MSR_AMD64_SYSCFG, val); + rdmsrq(MSR_AMD64_SYSCFG, val); val |= MSR_AMD64_SYSCFG_SNP_EN; val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN; - wrmsrl(MSR_AMD64_SYSCFG, val); + wrmsrq(MSR_AMD64_SYSCFG, val); return 0; } @@ -522,7 +523,7 @@ int __init snp_rmptable_init(void) * Check if SEV-SNP is already enabled, this can happen in case of * kexec boot. */ - rdmsrl(MSR_AMD64_SYSCFG, val); + rdmsrq(MSR_AMD64_SYSCFG, val); if (val & MSR_AMD64_SYSCFG_SNP_EN) goto skip_enable; @@ -576,8 +577,8 @@ static bool probe_contiguous_rmptable_info(void) { u64 rmp_sz, rmp_base, rmp_end; - rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); - rdmsrl(MSR_AMD64_RMP_END, rmp_end); + rdmsrq(MSR_AMD64_RMP_BASE, rmp_base); + rdmsrq(MSR_AMD64_RMP_END, rmp_end); if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) { pr_err("Memory for the RMP table has not been reserved by BIOS\n"); @@ -610,13 +611,13 @@ static bool probe_segmented_rmptable_info(void) unsigned int eax, ebx, segment_shift, segment_shift_min, segment_shift_max; u64 rmp_base, rmp_end; - rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); + rdmsrq(MSR_AMD64_RMP_BASE, rmp_base); if (!(rmp_base & RMP_ADDR_MASK)) { pr_err("Memory for the RMP table has not been reserved by BIOS\n"); return false; } - rdmsrl(MSR_AMD64_RMP_END, rmp_end); + rdmsrq(MSR_AMD64_RMP_END, rmp_end); WARN_ONCE(rmp_end & RMP_ADDR_MASK, "Segmented RMP enabled but RMP_END MSR is non-zero\n"); @@ -652,7 +653,7 @@ static bool probe_segmented_rmptable_info(void) bool snp_probe_rmptable_info(void) { if (cpu_feature_enabled(X86_FEATURE_SEGMENTED_RMP)) - rdmsrl(MSR_AMD64_RMP_CFG, rmp_cfg); + rdmsrq(MSR_AMD64_RMP_CFG, rmp_cfg); if (rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED) return probe_segmented_rmptable_info(); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 43dcd8c7badc..53282dc7d5ac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -70,6 +70,9 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; +/* Number of pages released from the initial allocation. */ +unsigned long xen_released_pages; + static __ref void xen_get_vendor(void) { init_cpu_devs(); @@ -100,10 +103,6 @@ noinstr void *__xen_hypercall_setfunc(void) void (*func)(void); /* - * Xen is supported only on CPUs with CPUID, so testing for - * X86_FEATURE_CPUID is a test for early_cpu_init() having been - * run. - * * Note that __xen_hypercall_setfunc() is noinstr only due to a nasty * dependency chain: it is being called via the xen_hypercall static * call when running as a PVH or HVM guest. Hypercalls need to be @@ -115,8 +114,7 @@ noinstr void *__xen_hypercall_setfunc(void) */ instrumentation_begin(); - if (!boot_cpu_has(X86_FEATURE_CPUID)) - xen_get_vendor(); + xen_get_vendor(); if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) @@ -466,6 +464,13 @@ int __init arch_xen_unpopulated_init(struct resource **res) xen_free_unpopulated_pages(1, &pg); } + /* + * Account for the region being in the physmap but unpopulated. + * The value in xen_released_pages is used by the balloon + * driver to know how much of the physmap is unpopulated and + * set an accurate initial memory target. + */ + xen_released_pages += xen_extra_mem[i].n_pfns; /* Zero so region is not also added to the balloon driver. */ xen_extra_mem[i].n_pfns = 0; } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 846b5737d320..26bbaf4b7330 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -49,7 +49,7 @@ #include <xen/hvc-console.h> #include <xen/acpi.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/paravirt.h> #include <asm/apic.h> #include <asm/page.h> @@ -61,6 +61,7 @@ #include <asm/processor.h> #include <asm/proto.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/traps.h> #include <asm/setup.h> #include <asm/desc.h> @@ -1086,15 +1087,15 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -static u64 xen_do_read_msr(unsigned int msr, int *err) +static u64 xen_do_read_msr(u32 msr, int *err) { u64 val = 0; /* Avoid uninitialized value for safe variant. */ - if (pmu_msr_read(msr, &val, err)) + if (pmu_msr_chk_emulated(msr, &val, true)) return val; if (err) - val = native_read_msr_safe(msr, err); + *err = native_read_msr_safe(msr, &val); else val = native_read_msr(msr); @@ -1110,17 +1111,9 @@ static u64 xen_do_read_msr(unsigned int msr, int *err) return val; } -static void set_seg(unsigned int which, unsigned int low, unsigned int high, - int *err) +static void set_seg(u32 which, u64 base) { - u64 base = ((u64)high << 32) | low; - - if (HYPERVISOR_set_segment_base(which, base) == 0) - return; - - if (err) - *err = -EIO; - else + if (HYPERVISOR_set_segment_base(which, base)) WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base); } @@ -1129,20 +1122,19 @@ static void set_seg(unsigned int which, unsigned int low, unsigned int high, * With err == NULL write_msr() semantics are selected. * Supplying an err pointer requires err to be pre-initialized with 0. */ -static void xen_do_write_msr(unsigned int msr, unsigned int low, - unsigned int high, int *err) +static void xen_do_write_msr(u32 msr, u64 val, int *err) { switch (msr) { case MSR_FS_BASE: - set_seg(SEGBASE_FS, low, high, err); + set_seg(SEGBASE_FS, val); break; case MSR_KERNEL_GS_BASE: - set_seg(SEGBASE_GS_USER, low, high, err); + set_seg(SEGBASE_GS_USER, val); break; case MSR_GS_BASE: - set_seg(SEGBASE_GS_KERNEL, low, high, err); + set_seg(SEGBASE_GS_KERNEL, val); break; case MSR_STAR: @@ -1158,42 +1150,45 @@ static void xen_do_write_msr(unsigned int msr, unsigned int low, break; default: - if (!pmu_msr_write(msr, low, high, err)) { - if (err) - *err = native_write_msr_safe(msr, low, high); - else - native_write_msr(msr, low, high); - } + if (pmu_msr_chk_emulated(msr, &val, false)) + return; + + if (err) + *err = native_write_msr_safe(msr, val); + else + native_write_msr(msr, val); } } -static u64 xen_read_msr_safe(unsigned int msr, int *err) +static int xen_read_msr_safe(u32 msr, u64 *val) { - return xen_do_read_msr(msr, err); + int err = 0; + + *val = xen_do_read_msr(msr, &err); + return err; } -static int xen_write_msr_safe(unsigned int msr, unsigned int low, - unsigned int high) +static int xen_write_msr_safe(u32 msr, u64 val) { int err = 0; - xen_do_write_msr(msr, low, high, &err); + xen_do_write_msr(msr, val, &err); return err; } -static u64 xen_read_msr(unsigned int msr) +static u64 xen_read_msr(u32 msr) { - int err; + int err = 0; return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL); } -static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +static void xen_write_msr(u32 msr, u64 val) { int err; - xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL); + xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL); } /* This is called once we have the cpu_possible_mask */ diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 0e3d930bcb89..9d25d9373945 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> +#include <linux/cpufreq.h> +#include <linux/cpuidle.h> #include <linux/export.h> #include <linux/mm.h> @@ -123,8 +125,23 @@ static void __init pvh_arch_setup(void) { pvh_reserve_extra_memory(); - if (xen_initial_domain()) + if (xen_initial_domain()) { xen_add_preferred_consoles(); + + /* + * Disable usage of CPU idle and frequency drivers: when + * running as hardware domain the exposed native ACPI tables + * causes idle and/or frequency drivers to attach and + * malfunction. It's Xen the entity that controls the idle and + * frequency states. + * + * For unprivileged domains the exposed ACPI tables are + * fabricated and don't contain such data. + */ + disable_cpuidle(); + disable_cpufreq(); + WARN_ON(xen_set_default_idle()); + } } void __init xen_pvh_init(struct boot_params *boot_params) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 38971c6dcd4b..2a4a8deaf612 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -578,7 +578,6 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) xen_mc_issue(XEN_LAZY_MMU); } -#if CONFIG_PGTABLE_LEVELS >= 5 __visible p4dval_t xen_p4d_val(p4d_t p4d) { return pte_mfn_to_pfn(p4d.p4d); @@ -592,7 +591,6 @@ __visible p4d_t xen_make_p4d(p4dval_t p4d) return native_make_p4d(p4d); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d); -#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, void (*func)(struct mm_struct *mm, struct page *, @@ -2222,10 +2220,8 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, -#if CONFIG_PGTABLE_LEVELS >= 5 .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), -#endif .enter_mmap = xen_enter_mmap, .exit_mmap = xen_exit_mmap, diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 10c660fae8b3..7237d56a9d3f 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -54,14 +54,20 @@ struct mc_debug_data { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); static struct mc_debug_data mc_debug_data_early __initdata; -static DEFINE_PER_CPU(struct mc_debug_data *, mc_debug_data) = - &mc_debug_data_early; static struct mc_debug_data __percpu *mc_debug_data_ptr; DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); static struct static_key mc_debug __ro_after_init; static bool mc_debug_enabled __initdata; +static struct mc_debug_data * __ref get_mc_debug(void) +{ + if (!mc_debug_data_ptr) + return &mc_debug_data_early; + + return this_cpu_ptr(mc_debug_data_ptr); +} + static int __init xen_parse_mc_debug(char *arg) { mc_debug_enabled = true; @@ -71,20 +77,16 @@ static int __init xen_parse_mc_debug(char *arg) } early_param("xen_mc_debug", xen_parse_mc_debug); -void mc_percpu_init(unsigned int cpu) -{ - per_cpu(mc_debug_data, cpu) = per_cpu_ptr(mc_debug_data_ptr, cpu); -} - static int __init mc_debug_enable(void) { unsigned long flags; + struct mc_debug_data __percpu *mcdb; if (!mc_debug_enabled) return 0; - mc_debug_data_ptr = alloc_percpu(struct mc_debug_data); - if (!mc_debug_data_ptr) { + mcdb = alloc_percpu(struct mc_debug_data); + if (!mcdb) { pr_err("xen_mc_debug inactive\n"); static_key_slow_dec(&mc_debug); return -ENOMEM; @@ -93,7 +95,7 @@ static int __init mc_debug_enable(void) /* Be careful when switching to percpu debug data. */ local_irq_save(flags); xen_mc_flush(); - mc_percpu_init(0); + mc_debug_data_ptr = mcdb; local_irq_restore(flags); pr_info("xen_mc_debug active\n"); @@ -155,7 +157,7 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); if (static_key_false(&mc_debug)) { - mcdb = __this_cpu_read(mc_debug_data); + mcdb = get_mc_debug(); memcpy(mcdb->entries, b->entries, b->mcidx * sizeof(struct multicall_entry)); } @@ -235,7 +237,7 @@ struct multicall_space __xen_mc_entry(size_t args) ret.mc = &b->entries[b->mcidx]; if (static_key_false(&mc_debug)) { - struct mc_debug_data *mcdb = __this_cpu_read(mc_debug_data); + struct mc_debug_data *mcdb = get_mc_debug(); mcdb->caller[b->mcidx] = __builtin_return_address(0); mcdb->argsz[b->mcidx] = args; diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index f06987b0efc3..8f89ce0b67e3 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -2,6 +2,7 @@ #include <linux/types.h> #include <linux/interrupt.h> +#include <asm/msr.h> #include <asm/xen/hypercall.h> #include <xen/xen.h> #include <xen/page.h> @@ -128,7 +129,7 @@ static inline uint32_t get_fam15h_addr(u32 addr) return addr; } -static inline bool is_amd_pmu_msr(unsigned int msr) +static bool is_amd_pmu_msr(u32 msr) { if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -194,8 +195,7 @@ static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index) } } -static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, - int index, bool is_read) +static bool xen_intel_pmu_emulate(u32 msr, u64 *val, int type, int index, bool is_read) { uint64_t *reg = NULL; struct xen_pmu_intel_ctxt *ctxt; @@ -257,7 +257,7 @@ static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, return false; } -static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +static bool xen_amd_pmu_emulate(u32 msr, u64 *val, bool is_read) { uint64_t *reg = NULL; int i, off = 0; @@ -298,55 +298,20 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) return false; } -static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read, - bool *emul) +bool pmu_msr_chk_emulated(u32 msr, u64 *val, bool is_read) { int type, index = 0; if (is_amd_pmu_msr(msr)) - *emul = xen_amd_pmu_emulate(msr, val, is_read); - else if (is_intel_pmu_msr(msr, &type, &index)) - *emul = xen_intel_pmu_emulate(msr, val, type, index, is_read); - else - return false; - - return true; -} - -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) -{ - bool emulated; + return xen_amd_pmu_emulate(msr, val, is_read); - if (!pmu_msr_chk_emulated(msr, val, true, &emulated)) - return false; + if (is_intel_pmu_msr(msr, &type, &index)) + return xen_intel_pmu_emulate(msr, val, type, index, is_read); - if (!emulated) { - *val = err ? native_read_msr_safe(msr, err) - : native_read_msr(msr); - } - - return true; -} - -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) -{ - uint64_t val = ((uint64_t)high << 32) | low; - bool emulated; - - if (!pmu_msr_chk_emulated(msr, &val, false, &emulated)) - return false; - - if (!emulated) { - if (err) - *err = native_write_msr_safe(msr, low, high); - else - native_write_msr(msr, low, high); - } - - return true; + return false; } -static unsigned long long xen_amd_read_pmc(int counter) +static u64 xen_amd_read_pmc(int counter) { struct xen_pmu_amd_ctxt *ctxt; uint64_t *counter_regs; @@ -354,11 +319,12 @@ static unsigned long long xen_amd_read_pmc(int counter) uint8_t xenpmu_flags = get_xenpmu_flags(); if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { - uint32_t msr; - int err; + u32 msr; + u64 val; msr = amd_counters_base + (counter * amd_msr_step); - return native_read_msr_safe(msr, &err); + native_read_msr_safe(msr, &val); + return val; } ctxt = &xenpmu_data->pmu.c.amd; @@ -366,7 +332,7 @@ static unsigned long long xen_amd_read_pmc(int counter) return counter_regs[counter]; } -static unsigned long long xen_intel_read_pmc(int counter) +static u64 xen_intel_read_pmc(int counter) { struct xen_pmu_intel_ctxt *ctxt; uint64_t *fixed_counters; @@ -375,15 +341,16 @@ static unsigned long long xen_intel_read_pmc(int counter) uint8_t xenpmu_flags = get_xenpmu_flags(); if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { - uint32_t msr; - int err; + u32 msr; + u64 val; if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); else msr = MSR_IA32_PERFCTR0 + counter; - return native_read_msr_safe(msr, &err); + native_read_msr_safe(msr, &val); + return val; } ctxt = &xenpmu_data->pmu.c.intel; @@ -396,7 +363,7 @@ static unsigned long long xen_intel_read_pmc(int counter) return arch_cntr_pair[counter].counter; } -unsigned long long xen_read_pmc(int counter) +u64 xen_read_pmc(int counter) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return xen_amd_read_pmc(counter); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c3db71d96c43..3823e52aef52 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -37,9 +37,6 @@ #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) -/* Number of pages released from the initial allocation. */ -unsigned long xen_released_pages; - /* Memory map would allow PCI passthrough. */ bool xen_pv_pci_possible; diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 688ff59318ae..9bb8ff8bff30 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -305,7 +305,6 @@ static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle) return rc; xen_pmu_init(cpu); - mc_percpu_init(cpu); /* * Why is this a BUG? If the hypercall fails then everything can be diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 77a6ea1c60e4..ba2f17e64321 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -13,6 +13,7 @@ #include <asm/xen/hypercall.h> #include <asm/xen/page.h> #include <asm/fixmap.h> +#include <asm/msr.h> #include "xen-ops.h" @@ -39,7 +40,7 @@ void xen_arch_post_suspend(int cancelled) static void xen_vcpu_notify_restore(void *data) { if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl)); + wrmsrq(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl)); /* Boot processor notified via generic timekeeping_resume() */ if (smp_processor_id() == 0) @@ -55,9 +56,9 @@ static void xen_vcpu_notify_suspend(void *data) tick_suspend_local(); if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { - rdmsrl(MSR_IA32_SPEC_CTRL, tmp); + rdmsrq(MSR_IA32_SPEC_CTRL, tmp); this_cpu_write(spec_ctrl, tmp); - wrmsrl(MSR_IA32_SPEC_CTRL, 0); + wrmsrq(MSR_IA32_SPEC_CTRL, 0); } } diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 109af12f7647..461bb1526502 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -226,9 +226,7 @@ SYM_CODE_END(xen_early_idt_handler_array) push %rax mov $__HYPERVISOR_iret, %eax syscall /* Do the IRET. */ -#ifdef CONFIG_MITIGATION_SLS - int3 -#endif + ud2 /* The SYSCALL should never return. */ .endm SYM_CODE_START(xen_iret) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 63c13a2ccf55..090349baec09 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -261,9 +261,6 @@ void xen_mc_callback(void (*fn)(void *), void *data); */ struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); -/* Do percpu data initialization for multicalls. */ -void mc_percpu_init(unsigned int cpu); - extern bool is_xen_pmu; irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); @@ -274,10 +271,9 @@ void xen_pmu_finish(int cpu); static inline void xen_pmu_init(int cpu) {} static inline void xen_pmu_finish(int cpu) {} #endif -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +bool pmu_msr_chk_emulated(u32 msr, u64 *val, bool is_read); int pmu_apic_update(uint32_t reg); -unsigned long long xen_read_pmc(int counter); +u64 xen_read_pmc(int counter); #ifdef CONFIG_SMP |