Diffstat (limited to 'arch/x86')
194 files changed, 4158 insertions, 2585 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 6a1f36df6a18..cf0ad89f5639 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -28,7 +28,7 @@ obj-y += net/ obj-$(CONFIG_KEXEC_FILE) += purgatory/ -obj-y += virt/svm/ +obj-y += virt/ # for cleaning subdir- += boot tools diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7aed87cbf386..40565564a625 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ACPI_HOTPLUG_CPU if ACPI_PROCESSOR && HOTPLUG_CPU select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_CONFIGURES_CPU_MITIGATIONS select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 @@ -472,10 +473,6 @@ config X86_MPPARSE For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -config GOLDFISH - def_bool y - depends on X86_GOLDFISH - config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) @@ -504,12 +501,11 @@ config X86_FRED When enabled, try to use Flexible Return and Event Delivery instead of the legacy SYSCALL/SYSENTER/IDT architecture for ring transitions and exception/interrupt handling if the - system supports. + system supports it. -if X86_32 config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" - depends on SMP + depends on SMP && X86_32 help This option is needed for the systems that have more than 8 CPUs. @@ -522,7 +518,10 @@ config X86_EXTENDED_PLATFORM systems out there.) If you enable this option then you'll be able to select support - for the following (non-PC) 32 bit x86 platforms: + for the following non-PC x86 platforms, depending on the value of + CONFIG_64BIT. + + 32-bit platforms (CONFIG_64BIT=n): Goldfish (Android emulator) AMD Elan RDC R-321x SoC @@ -530,28 +529,14 @@ config X86_EXTENDED_PLATFORM STA2X11-based (e.g. Northville) Moorestown MID devices - If you have one of these systems, or if you want to build a - generic distribution kernel, say Y here - otherwise say N. -endif # X86_32 - -if X86_64 -config X86_EXTENDED_PLATFORM - bool "Support for extended (non-PC) x86 platforms" - default y - help - If you disable this option then the kernel will only support - standard PC platforms. (which covers the vast majority of - systems out there.) - - If you enable this option then you'll be able to select support - for the following (non-PC) 64 bit x86 platforms: + 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip ScaleMP vSMP SGI Ultraviolet If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. -endif # X86_64 + # This is an alphabetically sorted list of 64 bit extended platforms # Please maintain the alphabetic order if and when there are additions config X86_NUMACHIP @@ -2443,6 +2428,8 @@ config USE_X86_SEG_SUPPORT # with named address spaces - see GCC PR sanitizer/111736. # depends on !KASAN + # -fsanitize=thread (KCSAN) is also incompatible. 
+ depends on !KCSAN config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) @@ -2490,17 +2477,21 @@ config PREFIX_SYMBOLS def_bool y depends on CALL_PADDING && !CFI_CLANG -menuconfig SPECULATION_MITIGATIONS - bool "Mitigations for speculative execution vulnerabilities" +menuconfig CPU_MITIGATIONS + bool "Mitigations for CPU vulnerabilities" default y help - Say Y here to enable options which enable mitigations for - speculative execution hardware vulnerabilities. + Say Y here to enable options which enable mitigations for hardware + vulnerabilities (usually related to speculative execution). + Mitigations can be disabled or restricted to SMT systems at runtime + via the "mitigations" kernel parameter. + + If you say N, all mitigations will be disabled. This CANNOT be + overridden at runtime. - If you say N, all mitigations will be disabled. You really - should know what you are doing to say so. + Say 'Y', unless you really know what you are doing. -if SPECULATION_MITIGATIONS +if CPU_MITIGATIONS config MITIGATION_PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" @@ -2635,6 +2626,16 @@ config MITIGATION_RFDS stored in floating point, vector and integer registers. See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst> +config MITIGATION_SPECTRE_BHI + bool "Mitigate Spectre-BHB (Branch History Injection)" + depends on CPU_SUP_INTEL + default y + help + Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks + where the branch history buffer is poisoned to speculatively steer + indirect branches. + See <file:Documentation/admin-guide/hw-vuln/spectre.rst> + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index 8ad41da301e5..59aedf32c4ea 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -25,6 +25,16 @@ config AS_GFNI help Supported by binutils >= 2.30 and LLVM integrated assembler +config AS_VAES + def_bool $(as-instr,vaesenc %ymm0$(comma)%ymm1$(comma)%ymm2) + help + Supported by binutils >= 2.30 and LLVM integrated assembler + +config AS_VPCLMULQDQ + def_bool $(as-instr,vpclmulqdq \$0x10$(comma)%ymm0$(comma)%ymm1$(comma)%ymm2) + help + Supported by binutils >= 2.30 and LLVM integrated assembler + config AS_WRUSS def_bool $(as-instr,wrussq %rax$(comma)(%rbx)) help diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 662d9d4033e6..5ab93fcdd691 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -251,8 +251,6 @@ archheaders: libs-y += arch/x86/lib/ -core-y += arch/x86/virt/ - # drivers-y are linked after core-y drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ drivers-$(CONFIG_PCI) += arch/x86/pci/ diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index f4e22ef774ab..876fc6d46a13 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -15,10 +15,12 @@ */ #include <linux/linkage.h> +#include <asm/asm-offsets.h> #include <asm/msr.h> #include <asm/page_types.h> #include <asm/processor-flags.h> #include <asm/segment.h> +#include <asm/setup.h> .code64 .text @@ -49,6 +51,11 @@ SYM_FUNC_START(startup_64_mixed_mode) lea efi32_boot_args(%rip), %rdx mov 0(%rdx), %edi mov 4(%rdx), %esi + + /* Switch to the firmware's stack */ + movl efi32_boot_sp(%rip), %esp + andl $~7, %esp + #ifdef CONFIG_EFI_HANDOVER_PROTOCOL mov 8(%rdx), %edx // saved bootparams pointer test %edx, %edx @@ -144,6 +151,7 @@ SYM_FUNC_END(__efi64_thunk) SYM_FUNC_START(efi32_stub_entry) call 1f 1: popl %ecx + leal 
(efi32_boot_args - 1b)(%ecx), %ebx /* Clear BSS */ xorl %eax, %eax @@ -158,6 +166,7 @@ SYM_FUNC_START(efi32_stub_entry) popl %ecx popl %edx popl %esi + movl %esi, 8(%ebx) jmp efi32_entry SYM_FUNC_END(efi32_stub_entry) #endif @@ -234,8 +243,6 @@ SYM_FUNC_END(efi_enter32) * * Arguments: %ecx image handle * %edx EFI system table pointer - * %esi struct bootparams pointer (or NULL when not using - * the EFI handover protocol) * * Since this is the point of no return for ordinary execution, no registers * are considered live except for the function parameters. [Note that the EFI @@ -254,13 +261,25 @@ SYM_FUNC_START_LOCAL(efi32_entry) /* Store firmware IDT descriptor */ sidtl (efi32_boot_idt - 1b)(%ebx) + /* Store firmware stack pointer */ + movl %esp, (efi32_boot_sp - 1b)(%ebx) + /* Store boot arguments */ leal (efi32_boot_args - 1b)(%ebx), %ebx movl %ecx, 0(%ebx) movl %edx, 4(%ebx) - movl %esi, 8(%ebx) movb $0x0, 12(%ebx) // efi_is64 + /* + * Allocate some memory for a temporary struct boot_params, which only + * needs the minimal pieces that startup_32() relies on. + */ + subl $PARAM_SIZE, %esp + movl %esp, %esi + movl $PAGE_SIZE, BP_kernel_alignment(%esi) + movl $_end - 1b, BP_init_size(%esi) + subl $startup_32 - 1b, BP_init_size(%esi) + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax @@ -286,8 +305,7 @@ SYM_FUNC_START(efi32_pe_entry) movl 8(%ebp), %ecx // image_handle movl 12(%ebp), %edx // sys_table - xorl %esi, %esi - jmp efi32_entry // pass %ecx, %edx, %esi + jmp efi32_entry // pass %ecx, %edx // no other registers remain live 2: popl %edi // restore callee-save registers @@ -318,5 +336,6 @@ SYM_DATA_END(efi32_boot_idt) SYM_DATA_LOCAL(efi32_boot_cs, .word 0) SYM_DATA_LOCAL(efi32_boot_ds, .word 0) +SYM_DATA_LOCAL(efi32_boot_sp, .long 0) SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) SYM_DATA(efi_is64, .byte 1) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index bf4a10a5794f..1dcb794c5479 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -398,6 +398,11 @@ SYM_CODE_START(startup_64) call sev_enable #endif + /* Preserve only the CR4 bits that must be preserved, and clear the rest */ + movq %cr4, %rax + andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax + movq %rax, %cr4 + /* * configure_5level_paging() updates the number of paging levels using * a trampoline in 32-bit addressable memory if the current number does diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index c4ea5258ab55..9049f390d834 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -119,8 +119,8 @@ static void init_heap(void) char *stack_end; if (boot_params.hdr.loadflags & CAN_USE_HEAP) { - asm("leal %P1(%%esp),%0" - : "=r" (stack_end) : "i" (-STACK_SIZE)); + asm("leal %n1(%%esp),%0" + : "=r" (stack_end) : "i" (STACK_SIZE)); heap_end = (char *) ((size_t)boot_params.hdr.heap_end_ptr + 0x200); diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d07be9d05cd0..b31ef2424d19 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -3,19 +3,28 @@ * Confidential Computing Platform Capability checks * * Copyright (C) 2021 Advanced Micro Devices, Inc. + * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
* * Author: Tom Lendacky <thomas.lendacky@amd.com> */ #include <linux/export.h> #include <linux/cc_platform.h> +#include <linux/string.h> +#include <linux/random.h> +#include <asm/archrandom.h> #include <asm/coco.h> #include <asm/processor.h> enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; u64 cc_mask __ro_after_init; +static struct cc_attr_flags { + __u64 host_sev_snp : 1, + __resv : 63; +} cc_flags; + static bool noinstr intel_cc_platform_has(enum cc_attr attr) { switch (attr) { @@ -89,6 +98,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_SEV_SNP: return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + case CC_ATTR_HOST_SEV_SNP: + return cc_flags.host_sev_snp; + default: return false; } @@ -148,3 +160,84 @@ u64 cc_mkdec(u64 val) } } EXPORT_SYMBOL_GPL(cc_mkdec); + +static void amd_cc_platform_clear(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 0; + break; + default: + break; + } +} + +void cc_platform_clear(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_clear(attr); + break; + default: + break; + } +} + +static void amd_cc_platform_set(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 1; + break; + default: + break; + } +} + +void cc_platform_set(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_set(attr); + break; + default: + break; + } +} + +__init void cc_random_init(void) +{ + /* + * The seed is 32 bytes (in units of longs), which is 256 bits, which + * is the security level that the RNG is targeting. + */ + unsigned long rng_seed[32 / sizeof(long)]; + size_t i, longs; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * Since the CoCo threat model includes the host, the only reliable + * source of entropy that can be neither observed nor manipulated is + * RDRAND. Usually, RDRAND failure is considered tolerable, but since + * CoCo guests have no other unobservable source of entropy, it's + * important to at least ensure the RNG gets some initial random seeds. + */ + for (i = 0; i < ARRAY_SIZE(rng_seed); i += longs) { + longs = arch_get_random_longs(&rng_seed[i], ARRAY_SIZE(rng_seed) - i); + + /* + * A zero return value means that the guest doesn't have RDRAND + * or the CPU is physically broken, and in both cases that + * means most crypto inside of the CoCo instance will be + * broken, defeating the purpose of CoCo in the first place. So + * just panic here because it's absolutely unsafe to continue + * executing. + */ + if (longs == 0) + panic("RDRAND is defective."); + } + add_device_randomness(rng_seed, sizeof(rng_seed)); + memzero_explicit(rng_seed, sizeof(rng_seed)); +} diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config index 7b497f3b7bc3..de319852a1e9 100644 --- a/arch/x86/configs/hardening.config +++ b/arch/x86/configs/hardening.config @@ -10,5 +10,8 @@ CONFIG_INTEL_IOMMU_DEFAULT_ON=y CONFIG_INTEL_IOMMU_SVM=y CONFIG_AMD_IOMMU=y +# Enforce CET Indirect Branch Tracking in the kernel. +CONFIG_X86_KERNEL_IBT=y + # Enable CET Shadow Stack for userspace. 
CONFIG_X86_USER_SHADOW_STACK=y diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 66c9e2aab16c..be3ee4294903 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,5 +1,6 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +# CONFIG_UNWINDER_ORC is not set CONFIG_UNWINDER_GUESS=y # CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9aa46093c91b..9c5ce5613738 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,7 +48,8 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o -aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o +aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ + aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S new file mode 100644 index 000000000000..48f97b79f7a9 --- /dev/null +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -0,0 +1,845 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * AES-XTS for modern x86_64 CPUs + * + * Copyright 2024 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +/* + * This file implements AES-XTS for modern x86_64 CPUs. To handle the + * complexities of coding for x86 SIMD, e.g. where every vector length needs + * different code, it uses a macro to generate several implementations that + * share similar source code but are targeted at different CPUs, listed below: + * + * AES-NI + AVX + * - 128-bit vectors (1 AES block per vector) + * - VEX-coded instructions + * - xmm0-xmm15 + * - This is for older CPUs that lack VAES but do have AVX. + * + * VAES + VPCLMULQDQ + AVX2 + * - 256-bit vectors (2 AES blocks per vector) + * - VEX-coded instructions + * - ymm0-ymm15 + * - This is for CPUs that have VAES but lack AVX512 or AVX10, + * e.g. Intel's Alder Lake and AMD's Zen 3. + * + * VAES + VPCLMULQDQ + AVX10/256 + BMI2 + * - 256-bit vectors (2 AES blocks per vector) + * - EVEX-coded instructions + * - ymm0-ymm31 + * - This is for CPUs that have AVX512 but where using zmm registers causes + * downclocking, and for CPUs that have AVX10/256 but not AVX10/512. + * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256. + * To avoid confusion with 512-bit, we just write AVX10/256. + * + * VAES + VPCLMULQDQ + AVX10/512 + BMI2 + * - Same as the previous one, but upgrades to 512-bit vectors + * (4 AES blocks per vector) in zmm0-zmm31. + * - This is for CPUs that have good AVX512 or AVX10/512 support. + * + * This file doesn't have an implementation for AES-NI alone (without AVX), as + * the lack of VEX would make all the assembly code different. + * + * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of + * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be + * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might + * need to start also providing an implementation using VAES alone. + * + * The AES-XTS implementations in this file support everything required by the + * crypto API, including support for arbitrary input lengths and multi-part + * processing. 
However, they are most heavily optimized for the common case of + * power-of-2 length inputs that are processed in a single part (disk sectors). + */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> + +.section .rodata +.p2align 4 +.Lgf_poly: + // The low 64 bits of this value represent the polynomial x^7 + x^2 + x + // + 1. It is the value that must be XOR'd into the low 64 bits of the + // tweak each time a 1 is carried out of the high 64 bits. + // + // The high 64 bits of this value is just the internal carry bit that + // exists when there's a carry out of the low 64 bits of the tweak. + .quad 0x87, 1 + + // This table contains constants for vpshufb and vpblendvb, used to + // handle variable byte shifts and blending during ciphertext stealing + // on CPUs that don't support AVX10-style masking. +.Lcts_permute_table: + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +.text + +// Function parameters +.set KEY, %rdi // Initially points to crypto_aes_ctx, then is + // advanced to point to 7th-from-last round key +.set SRC, %rsi // Pointer to next source data +.set DST, %rdx // Pointer to next destination data +.set LEN, %ecx // Remaining length in bytes +.set LEN8, %cl +.set LEN64, %rcx +.set TWEAK, %r8 // Pointer to next tweak + +// %rax holds the AES key length in bytes. +.set KEYLEN, %eax +.set KEYLEN64, %rax + +// %r9-r11 are available as temporaries. + +.macro _define_Vi i +.if VL == 16 + .set V\i, %xmm\i +.elseif VL == 32 + .set V\i, %ymm\i +.elseif VL == 64 + .set V\i, %zmm\i +.else + .error "Unsupported Vector Length (VL)" +.endif +.endm + +.macro _define_aliases + // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers + // are available, that map to the xmm, ymm, or zmm registers according + // to the selected Vector Length (VL). + _define_Vi 0 + _define_Vi 1 + _define_Vi 2 + _define_Vi 3 + _define_Vi 4 + _define_Vi 5 + _define_Vi 6 + _define_Vi 7 + _define_Vi 8 + _define_Vi 9 + _define_Vi 10 + _define_Vi 11 + _define_Vi 12 + _define_Vi 13 + _define_Vi 14 + _define_Vi 15 +.if USE_AVX10 + _define_Vi 16 + _define_Vi 17 + _define_Vi 18 + _define_Vi 19 + _define_Vi 20 + _define_Vi 21 + _define_Vi 22 + _define_Vi 23 + _define_Vi 24 + _define_Vi 25 + _define_Vi 26 + _define_Vi 27 + _define_Vi 28 + _define_Vi 29 + _define_Vi 30 + _define_Vi 31 +.endif + + // V0-V3 hold the data blocks during the main loop, or temporary values + // otherwise. V4-V5 hold temporary values. + + // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. + .set TWEAK0_XMM, %xmm6 + .set TWEAK0, V6 + .set TWEAK1_XMM, %xmm7 + .set TWEAK1, V7 + .set TWEAK2, V8 + .set TWEAK3, V9 + + // V10-V13 are used for computing the next values of TWEAK[0-3]. + .set NEXT_TWEAK0, V10 + .set NEXT_TWEAK1, V11 + .set NEXT_TWEAK2, V12 + .set NEXT_TWEAK3, V13 + + // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes. + .set GF_POLY_XMM, %xmm14 + .set GF_POLY, V14 + + // V15 holds the key for AES "round 0", copied to all 128-bit lanes. + .set KEY0_XMM, %xmm15 + .set KEY0, V15 + + // If 32 SIMD registers are available, then V16-V29 hold the remaining + // AES round keys, copied to all 128-bit lanes. + // + // AES-128, AES-192, and AES-256 use different numbers of round keys. 
+ // To allow handling all three variants efficiently, we align the round + // keys to the *end* of this register range. I.e., AES-128 uses + // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. + // (All also use KEY0 for the XOR-only "round" at the beginning.) +.if USE_AVX10 + .set KEY1_XMM, %xmm16 + .set KEY1, V16 + .set KEY2_XMM, %xmm17 + .set KEY2, V17 + .set KEY3_XMM, %xmm18 + .set KEY3, V18 + .set KEY4_XMM, %xmm19 + .set KEY4, V19 + .set KEY5_XMM, %xmm20 + .set KEY5, V20 + .set KEY6_XMM, %xmm21 + .set KEY6, V21 + .set KEY7_XMM, %xmm22 + .set KEY7, V22 + .set KEY8_XMM, %xmm23 + .set KEY8, V23 + .set KEY9_XMM, %xmm24 + .set KEY9, V24 + .set KEY10_XMM, %xmm25 + .set KEY10, V25 + .set KEY11_XMM, %xmm26 + .set KEY11, V26 + .set KEY12_XMM, %xmm27 + .set KEY12, V27 + .set KEY13_XMM, %xmm28 + .set KEY13, V28 + .set KEY14_XMM, %xmm29 + .set KEY14, V29 +.endif + // V30-V31 are currently unused. +.endm + +// Move a vector between memory and a register. +.macro _vmovdqu src, dst +.if VL < 64 + vmovdqu \src, \dst +.else + vmovdqu8 \src, \dst +.endif +.endm + +// Broadcast a 128-bit value into a vector. +.macro _vbroadcast128 src, dst +.if VL == 16 && !USE_AVX10 + vmovdqu \src, \dst +.elseif VL == 32 && !USE_AVX10 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// XOR two vectors together. +.macro _vpxor src1, src2, dst +.if USE_AVX10 + vpxord \src1, \src2, \dst +.else + vpxor \src1, \src2, \dst +.endif +.endm + +// XOR three vectors together. +.macro _xor3 src1, src2, src3_and_dst +.if USE_AVX10 + // vpternlogd with immediate 0x96 is a three-argument XOR. + vpternlogd $0x96, \src1, \src2, \src3_and_dst +.else + vpxor \src1, \src3_and_dst, \src3_and_dst + vpxor \src2, \src3_and_dst, \src3_and_dst +.endif +.endm + +// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak +// (by multiplying by the polynomial 'x') and write it to \dst. +.macro _next_tweak src, tmp, dst + vpshufd $0x13, \src, \tmp + vpaddq \src, \src, \dst + vpsrad $31, \tmp, \tmp + vpand GF_POLY_XMM, \tmp, \tmp + vpxor \tmp, \dst, \dst +.endm + +// Given the XTS tweak(s) in the vector \src, compute the next vector of +// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst. +// +// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute +// all tweaks in the vector in parallel. If VL=16, we just do the regular +// computation without vpclmulqdq, as it's the faster method for a single tweak. +.macro _next_tweakvec src, tmp1, tmp2, dst +.if VL == 16 + _next_tweak \src, \tmp1, \dst +.else + vpsrlq $64 - VL/16, \src, \tmp1 + vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2 + vpslldq $8, \tmp1, \tmp1 + vpsllq $VL/16, \src, \dst + _xor3 \tmp1, \tmp2, \dst +.endif +.endm + +// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and +// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. +.macro _compute_first_set_of_tweaks + vmovdqu (TWEAK), TWEAK0_XMM + _vbroadcast128 .Lgf_poly(%rip), GF_POLY +.if VL == 16 + // With VL=16, multiplying by x serially is fastest. + _next_tweak TWEAK0, %xmm0, TWEAK1 + _next_tweak TWEAK1, %xmm0, TWEAK2 + _next_tweak TWEAK2, %xmm0, TWEAK3 +.else +.if VL == 32 + // Compute the second block of TWEAK0. + _next_tweak TWEAK0_XMM, %xmm0, %xmm1 + vinserti128 $1, %xmm1, TWEAK0, TWEAK0 +.elseif VL == 64 + // Compute the remaining blocks of TWEAK0. 
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1 + _next_tweak %xmm1, %xmm0, %xmm2 + _next_tweak %xmm2, %xmm0, %xmm3 + vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0 + vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0 + vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0 +.endif + // Compute TWEAK[1-3] from TWEAK0. + vpsrlq $64 - 1*VL/16, TWEAK0, V0 + vpsrlq $64 - 2*VL/16, TWEAK0, V2 + vpsrlq $64 - 3*VL/16, TWEAK0, V4 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpclmulqdq $0x01, GF_POLY, V2, V3 + vpclmulqdq $0x01, GF_POLY, V4, V5 + vpslldq $8, V0, V0 + vpslldq $8, V2, V2 + vpslldq $8, V4, V4 + vpsllq $1*VL/16, TWEAK0, TWEAK1 + vpsllq $2*VL/16, TWEAK0, TWEAK2 + vpsllq $3*VL/16, TWEAK0, TWEAK3 +.if USE_AVX10 + vpternlogd $0x96, V0, V1, TWEAK1 + vpternlogd $0x96, V2, V3, TWEAK2 + vpternlogd $0x96, V4, V5, TWEAK3 +.else + vpxor V0, TWEAK1, TWEAK1 + vpxor V2, TWEAK2, TWEAK2 + vpxor V4, TWEAK3, TWEAK3 + vpxor V1, TWEAK1, TWEAK1 + vpxor V3, TWEAK2, TWEAK2 + vpxor V5, TWEAK3, TWEAK3 +.endif +.endif +.endm + +// Do one step in computing the next set of tweaks using the method of just +// multiplying by x repeatedly (the same method _next_tweak uses). +.macro _tweak_step_mulx i +.if \i == 0 + .set PREV_TWEAK, TWEAK3 + .set NEXT_TWEAK, NEXT_TWEAK0 +.elseif \i == 5 + .set PREV_TWEAK, NEXT_TWEAK0 + .set NEXT_TWEAK, NEXT_TWEAK1 +.elseif \i == 10 + .set PREV_TWEAK, NEXT_TWEAK1 + .set NEXT_TWEAK, NEXT_TWEAK2 +.elseif \i == 15 + .set PREV_TWEAK, NEXT_TWEAK2 + .set NEXT_TWEAK, NEXT_TWEAK3 +.endif +.if \i >= 0 && \i < 20 && \i % 5 == 0 + vpshufd $0x13, PREV_TWEAK, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 1 + vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK +.elseif \i >= 0 && \i < 20 && \i % 5 == 2 + vpsrad $31, V5, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 3 + vpand GF_POLY, V5, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 4 + vpxor V5, NEXT_TWEAK, NEXT_TWEAK +.elseif \i == 1000 + vmovdqa NEXT_TWEAK0, TWEAK0 + vmovdqa NEXT_TWEAK1, TWEAK1 + vmovdqa NEXT_TWEAK2, TWEAK2 + vmovdqa NEXT_TWEAK3, TWEAK3 +.endif +.endm + +// Do one step in computing the next set of tweaks using the VPCLMULQDQ method +// (the same method _next_tweakvec uses for VL > 16). This means multiplying +// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 +// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, +// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. +.macro _tweak_step_pclmul i +.if \i == 0 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 +.elseif \i == 2 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 +.elseif \i == 4 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2 +.elseif \i == 6 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3 +.elseif \i == 8 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0 +.elseif \i == 10 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1 +.elseif \i == 12 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2 +.elseif \i == 14 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3 +.elseif \i == 1000 + vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0 + vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1 + vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2 + vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3 + _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0 + _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1 + _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2 + _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3 +.endif +.endm + +// _tweak_step does one step of the computation of the next set of tweaks from +// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of +// \i that include at least 0 through 19, then 1000 which signals the last step. 
+// +// This is used to interleave the computation of the next set of tweaks with the +// AES en/decryptions, which increases performance in some cases. +.macro _tweak_step i +.if VL == 16 + _tweak_step_mulx \i +.else + _tweak_step_pclmul \i +.endif +.endm + +.macro _setup_round_keys enc + + // Select either the encryption round keys or the decryption round keys. +.if \enc + .set OFFS, 0 +.else + .set OFFS, 240 +.endif + + // Load the round key for "round 0". + _vbroadcast128 OFFS(KEY), KEY0 + + // Increment KEY to make it so that 7*16(KEY) is the last round key. + // For AES-128, increment by 3*16, resulting in the 10 round keys (not + // counting the zero-th round key which was just loaded into KEY0) being + // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use + // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment + // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY). + // + // This rebasing provides two benefits. First, it makes the offset to + // any round key be in the range [-96, 112], fitting in a signed byte. + // This shortens VEX-encoded instructions that access the later round + // keys which otherwise would need 4-byte offsets. Second, it makes it + // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the + // beginning. Skipping rounds at the end doesn't work as well because + // the last round needs different instructions. + // + // An alternative approach would be to roll up all the round loops. We + // don't do that because it isn't compatible with caching the round keys + // in registers which we do when possible (see below), and also because + // it seems unwise to rely *too* heavily on the CPU's branch predictor. + lea OFFS-16(KEY, KEYLEN64, 4), KEY + + // If all 32 SIMD registers are available, cache all the round keys. +.if USE_AVX10 + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + _vbroadcast128 -6*16(KEY), KEY1 + _vbroadcast128 -5*16(KEY), KEY2 +.Laes192\@: + _vbroadcast128 -4*16(KEY), KEY3 + _vbroadcast128 -3*16(KEY), KEY4 +.Laes128\@: + _vbroadcast128 -2*16(KEY), KEY5 + _vbroadcast128 -1*16(KEY), KEY6 + _vbroadcast128 0*16(KEY), KEY7 + _vbroadcast128 1*16(KEY), KEY8 + _vbroadcast128 2*16(KEY), KEY9 + _vbroadcast128 3*16(KEY), KEY10 + _vbroadcast128 4*16(KEY), KEY11 + _vbroadcast128 5*16(KEY), KEY12 + _vbroadcast128 6*16(KEY), KEY13 + _vbroadcast128 7*16(KEY), KEY14 +.endif +.endm + +// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) +// on the block(s) in \data using the round key(s) in \key. The register length +// determines the number of AES blocks en/decrypted. +.macro _vaes enc, last, key, data +.if \enc +.if \last + vaesenclast \key, \data, \data +.else + vaesenc \key, \data, \data +.endif +.else +.if \last + vaesdeclast \key, \data, \data +.else + vaesdec \key, \data, \data +.endif +.endif +.endm + +// Do a single round of AES en/decryption on the block(s) in \data, using the +// same key for all block(s). The round key is loaded from the appropriate +// register or memory location for round \i. May clobber V4. +.macro _vaes_1x enc, last, i, xmm_suffix, data +.if USE_AVX10 + _vaes \enc, \last, KEY\i\xmm_suffix, \data +.else +.ifnb \xmm_suffix + _vaes \enc, \last, (\i-7)*16(KEY), \data +.else + _vbroadcast128 (\i-7)*16(KEY), V4 + _vaes \enc, \last, V4, \data +.endif +.endif +.endm + +// Do a single round of AES en/decryption on the blocks in registers V0-V3, +// using the same key for all blocks. 
The round key is loaded from the +// appropriate register or memory location for round \i. In addition, does two +// steps of the computation of the next set of tweaks. May clobber V4. +.macro _vaes_4x enc, last, i +.if USE_AVX10 + _tweak_step (2*(\i-5)) + _vaes \enc, \last, KEY\i, V0 + _vaes \enc, \last, KEY\i, V1 + _tweak_step (2*(\i-5) + 1) + _vaes \enc, \last, KEY\i, V2 + _vaes \enc, \last, KEY\i, V3 +.else + _vbroadcast128 (\i-7)*16(KEY), V4 + _tweak_step (2*(\i-5)) + _vaes \enc, \last, V4, V0 + _vaes \enc, \last, V4, V1 + _tweak_step (2*(\i-5) + 1) + _vaes \enc, \last, V4, V2 + _vaes \enc, \last, V4, V3 +.endif +.endm + +// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, +// then XOR with \tweak again) of the block(s) in \data. To process a single +// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of +// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. +.macro _aes_crypt enc, xmm_suffix, tweak, data + _xor3 KEY0\xmm_suffix, \tweak, \data + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + _vaes_1x \enc, 0, 1, \xmm_suffix, \data + _vaes_1x \enc, 0, 2, \xmm_suffix, \data +.Laes192\@: + _vaes_1x \enc, 0, 3, \xmm_suffix, \data + _vaes_1x \enc, 0, 4, \xmm_suffix, \data +.Laes128\@: + _vaes_1x \enc, 0, 5, \xmm_suffix, \data + _vaes_1x \enc, 0, 6, \xmm_suffix, \data + _vaes_1x \enc, 0, 7, \xmm_suffix, \data + _vaes_1x \enc, 0, 8, \xmm_suffix, \data + _vaes_1x \enc, 0, 9, \xmm_suffix, \data + _vaes_1x \enc, 0, 10, \xmm_suffix, \data + _vaes_1x \enc, 0, 11, \xmm_suffix, \data + _vaes_1x \enc, 0, 12, \xmm_suffix, \data + _vaes_1x \enc, 0, 13, \xmm_suffix, \data + _vaes_1x \enc, 1, 14, \xmm_suffix, \data + _vpxor \tweak, \data, \data +.endm + +.macro _aes_xts_crypt enc + _define_aliases + +.if !\enc + // When decrypting a message whose length isn't a multiple of the AES + // block length, exclude the last full block from the main loop by + // subtracting 16 from LEN. This is needed because ciphertext stealing + // decryption uses the last two tweaks in reverse order. We'll handle + // the last full block and the partial block specially at the end. + lea -16(LEN), %eax + test $15, LEN8 + cmovnz %eax, LEN +.endif + + // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). + movl 480(KEY), KEYLEN + + // Setup the pointer to the round keys and cache as many as possible. + _setup_round_keys \enc + + // Compute the first set of tweaks TWEAK[0-3]. + _compute_first_set_of_tweaks + + sub $4*VL, LEN + jl .Lhandle_remainder\@ + +.Lmain_loop\@: + // This is the main loop, en/decrypting 4*VL bytes per iteration. + + // XOR each source block with its tweak and the zero-th round key. +.if USE_AVX10 + vmovdqu8 0*VL(SRC), V0 + vmovdqu8 1*VL(SRC), V1 + vmovdqu8 2*VL(SRC), V2 + vmovdqu8 3*VL(SRC), V3 + vpternlogd $0x96, TWEAK0, KEY0, V0 + vpternlogd $0x96, TWEAK1, KEY0, V1 + vpternlogd $0x96, TWEAK2, KEY0, V2 + vpternlogd $0x96, TWEAK3, KEY0, V3 +.else + vpxor 0*VL(SRC), KEY0, V0 + vpxor 1*VL(SRC), KEY0, V1 + vpxor 2*VL(SRC), KEY0, V2 + vpxor 3*VL(SRC), KEY0, V3 + vpxor TWEAK0, V0, V0 + vpxor TWEAK1, V1, V1 + vpxor TWEAK2, V2, V2 + vpxor TWEAK3, V3, V3 +.endif + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + // Do all the AES rounds on the data blocks, interleaved with + // the computation of the next set of tweaks. 
+ _vaes_4x \enc, 0, 1 + _vaes_4x \enc, 0, 2 +.Laes192\@: + _vaes_4x \enc, 0, 3 + _vaes_4x \enc, 0, 4 +.Laes128\@: + _vaes_4x \enc, 0, 5 + _vaes_4x \enc, 0, 6 + _vaes_4x \enc, 0, 7 + _vaes_4x \enc, 0, 8 + _vaes_4x \enc, 0, 9 + _vaes_4x \enc, 0, 10 + _vaes_4x \enc, 0, 11 + _vaes_4x \enc, 0, 12 + _vaes_4x \enc, 0, 13 + _vaes_4x \enc, 1, 14 + + // XOR in the tweaks again. + _vpxor TWEAK0, V0, V0 + _vpxor TWEAK1, V1, V1 + _vpxor TWEAK2, V2, V2 + _vpxor TWEAK3, V3, V3 + + // Store the destination blocks. + _vmovdqu V0, 0*VL(DST) + _vmovdqu V1, 1*VL(DST) + _vmovdqu V2, 2*VL(DST) + _vmovdqu V3, 3*VL(DST) + + // Finish computing the next set of tweaks. + _tweak_step 1000 + + add $4*VL, SRC + add $4*VL, DST + sub $4*VL, LEN + jge .Lmain_loop\@ + + // Check for the uncommon case where the data length isn't a multiple of + // 4*VL. Handle it out-of-line in order to optimize for the common + // case. In the common case, just fall through to the ret. + test $4*VL-1, LEN8 + jnz .Lhandle_remainder\@ +.Ldone\@: + // Store the next tweak back to *TWEAK to support continuation calls. + vmovdqu TWEAK0_XMM, (TWEAK) +.if VL > 16 + vzeroupper +.endif + RET + +.Lhandle_remainder\@: + + // En/decrypt any remaining full blocks, one vector at a time. +.if VL > 16 + add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL. + jl .Lvec_at_a_time_done\@ +.Lvec_at_a_time\@: + _vmovdqu (SRC), V0 + _aes_crypt \enc, , TWEAK0, V0 + _vmovdqu V0, (DST) + _next_tweakvec TWEAK0, V0, V1, TWEAK0 + add $VL, SRC + add $VL, DST + sub $VL, LEN + jge .Lvec_at_a_time\@ +.Lvec_at_a_time_done\@: + add $VL-16, LEN // Undo extra sub of VL, then sub 16. +.else + add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16. +.endif + + // En/decrypt any remaining full blocks, one at a time. + jl .Lblock_at_a_time_done\@ +.Lblock_at_a_time\@: + vmovdqu (SRC), %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + vmovdqu %xmm0, (DST) + _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM + add $16, SRC + add $16, DST + sub $16, LEN + jge .Lblock_at_a_time\@ +.Lblock_at_a_time_done\@: + add $16, LEN // Undo the extra sub of 16. + // Now 0 <= LEN <= 15. If LEN is zero, we're done. + jz .Ldone\@ + + // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN. + // Do ciphertext stealing to process the last 16 + LEN bytes. + +.if \enc + // If encrypting, the main loop already encrypted the last full block to + // create the CTS intermediate ciphertext. Prepare for the rest of CTS + // by rewinding the pointers and loading the intermediate ciphertext. + sub $16, SRC + sub $16, DST + vmovdqu (DST), %xmm0 +.else + // If decrypting, the main loop didn't decrypt the last full block + // because CTS decryption uses the last two tweaks in reverse order. + // Do it now by advancing the tweak and decrypting the last full block. + _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM + vmovdqu (SRC), %xmm0 + _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0 +.endif + +.if USE_AVX10 + // Create a mask that has the first LEN bits set. + mov $-1, %r9d + bzhi LEN, %r9d, %r9d + kmovd %r9d, %k1 + + // Swap the first LEN bytes of the en/decryption of the last full block + // with the partial block. Note that to support in-place en/decryption, + // the load from the src partial block must happen before the store to + // the dst partial block. + vmovdqa %xmm0, %xmm1 + vmovdqu8 16(SRC), %xmm0{%k1} + vmovdqu8 %xmm1, 16(DST){%k1} +.else + lea .Lcts_permute_table(%rip), %r9 + + // Load the src partial block, left-aligned. 
Note that to support + // in-place en/decryption, this must happen before the store to the dst + // partial block. + vmovdqu (SRC, LEN64, 1), %xmm1 + + // Shift the first LEN bytes of the en/decryption of the last full block + // to the end of a register, then store it to DST+LEN. This stores the + // dst partial block. It also writes to the second part of the dst last + // full block, but that part is overwritten later. + vpshufb (%r9, LEN64, 1), %xmm0, %xmm2 + vmovdqu %xmm2, (DST, LEN64, 1) + + // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...]. + sub LEN64, %r9 + vmovdqu 32(%r9), %xmm3 + + // Shift the src partial block to the beginning of its register. + vpshufb %xmm3, %xmm1, %xmm1 + + // Do a blend to generate the src partial block followed by the second + // part of the en/decryption of the last full block. + vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 +.endif + // En/decrypt again and store the last full block. + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + vmovdqu %xmm0, (DST) + jmp .Ldone\@ +.endm + +// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, +// u8 iv[AES_BLOCK_SIZE]); +SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) + vmovdqu (%rsi), %xmm0 + vpxor (%rdi), %xmm0, %xmm0 + movl 480(%rdi), %eax // AES key length + lea -16(%rdi, %rax, 4), %rdi + cmp $24, %eax + jl .Lencrypt_iv_aes128 + je .Lencrypt_iv_aes192 + vaesenc -6*16(%rdi), %xmm0, %xmm0 + vaesenc -5*16(%rdi), %xmm0, %xmm0 +.Lencrypt_iv_aes192: + vaesenc -4*16(%rdi), %xmm0, %xmm0 + vaesenc -3*16(%rdi), %xmm0, %xmm0 +.Lencrypt_iv_aes128: + vaesenc -2*16(%rdi), %xmm0, %xmm0 + vaesenc -1*16(%rdi), %xmm0, %xmm0 + vaesenc 0*16(%rdi), %xmm0, %xmm0 + vaesenc 1*16(%rdi), %xmm0, %xmm0 + vaesenc 2*16(%rdi), %xmm0, %xmm0 + vaesenc 3*16(%rdi), %xmm0, %xmm0 + vaesenc 4*16(%rdi), %xmm0, %xmm0 + vaesenc 5*16(%rdi), %xmm0, %xmm0 + vaesenc 6*16(%rdi), %xmm0, %xmm0 + vaesenclast 7*16(%rdi), %xmm0, %xmm0 + vmovdqu %xmm0, (%rsi) + RET +SYM_FUNC_END(aes_xts_encrypt_iv) + +// Below are the actual AES-XTS encryption and decryption functions, +// instantiated from the above macro. They all have the following prototype: +// +// void (*xts_asm_func)(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, unsigned int len, +// u8 tweak[AES_BLOCK_SIZE]); +// +// |key| is the data key. |tweak| contains the next tweak; the encryption of +// the original IV with the tweak key was already done. This function supports +// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and +// |len| must be a multiple of 16 except on the last call. If |len| is a +// multiple of 16, then this function updates |tweak| to contain the next tweak. 
+ +.set VL, 16 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_aesni_avx) +SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_aesni_avx) + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +.set VL, 32 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) + +.set VL, 32 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) + +.set VL, 64 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512) +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 411d8c83e88a..39066b57a70e 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -83,9 +83,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff .text - -#define STACK_OFFSET 8*3 - #define AadHash 16*0 #define AadLen 16*1 #define InLen (16*1)+8 @@ -116,11 +113,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define arg4 rcx #define arg5 r8 #define arg6 r9 -#define arg7 STACK_OFFSET+8(%rsp) -#define arg8 STACK_OFFSET+16(%rsp) -#define arg9 STACK_OFFSET+24(%rsp) -#define arg10 STACK_OFFSET+32(%rsp) -#define arg11 STACK_OFFSET+40(%rsp) #define keysize 2*15*16(%arg1) #endif @@ -1507,184 +1499,6 @@ _esb_loop_\@: MOVADQ (%r10),\TMP1 aesenclast \TMP1,\XMM0 .endm -/***************************************************************************** -* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Plaintext output. Encrypt in-place is allowed. -* const u8 *in, // Ciphertext input -* u64 plaintext_len, // Length of data in bytes for decryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the -* // given authentication tag and only return the plaintext if they match. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 -* // (most likely), 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. 
we are using the first -* set of 11 keys in the data structure void *aes_ctx -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -* -*****************************************************************************/ -SYM_FUNC_START(aesni_gcm_dec) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC dec - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec) - - -/***************************************************************************** -* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. 
we are using the -* first set of 11 keys in the data structure void *aes_ctx -* -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -***************************************************************************/ -SYM_FUNC_START(aesni_gcm_enc) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC enc - - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc) /***************************************************************************** * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. 
@@ -1820,8 +1634,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b) SYM_FUNC_END(_key_expansion_256b) /* - * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - * unsigned int key_len) + * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + * unsigned int key_len) */ SYM_FUNC_START(aesni_set_key) FRAME_BEGIN @@ -1926,7 +1740,6 @@ SYM_FUNC_START(aesni_set_key) sub $0x10, UKEYP cmp TKEYP, KEYP jb .Ldec_key_loop - xor AREG, AREG #ifndef __x86_64__ popl KEYP #endif @@ -2826,28 +2639,24 @@ SYM_FUNC_END(aesni_ctr_enc) .previous /* - * _aesni_gf128mul_x_ble: internal ABI - * Multiply in GF(2^128) for XTS IVs + * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs * input: * IV: current IV * GF128MUL_MASK == mask with 0x87 and 0x01 * output: * IV: next IV * changed: - * CTR: == temporary value + * KEY: == temporary value */ -#define _aesni_gf128mul_x_ble() \ - pshufd $0x13, IV, KEY; \ - paddq IV, IV; \ - psrad $31, KEY; \ - pand GF128MUL_MASK, KEY; \ - pxor KEY, IV; +.macro _aesni_gf128mul_x_ble + pshufd $0x13, IV, KEY + paddq IV, IV + psrad $31, KEY + pand GF128MUL_MASK, KEY + pxor KEY, IV +.endm -/* - * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_encrypt) +.macro _aesni_xts_crypt enc FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2866,35 +2675,46 @@ SYM_FUNC_START(aesni_xts_encrypt) movups (IVP), IV mov 480(KEYP), KLEN +.if !\enc + add $240, KEYP -.Lxts_enc_loop4: + test $15, LEN + jz .Lxts_loop4\@ + sub $16, LEN +.endif + +.Lxts_loop4\@: sub $64, LEN - jl .Lxts_enc_1x + jl .Lxts_1x\@ movdqa IV, STATE1 movdqu 0x00(INP), IN pxor IN, STATE1 movdqu IV, 0x00(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE2 movdqu 0x10(INP), IN pxor IN, STATE2 movdqu IV, 0x10(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE3 movdqu 0x20(INP), IN pxor IN, STATE3 movdqu IV, 0x20(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE4 movdqu 0x30(INP), IN pxor IN, STATE4 movdqu IV, 0x30(OUTP) +.if \enc call _aesni_enc4 +.else + call _aesni_dec4 +.endif movdqu 0x00(OUTP), IN pxor IN, STATE1 @@ -2912,17 +2732,17 @@ SYM_FUNC_START(aesni_xts_encrypt) pxor IN, STATE4 movdqu STATE4, 0x30(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble add $64, INP add $64, OUTP test LEN, LEN - jnz .Lxts_enc_loop4 + jnz .Lxts_loop4\@ -.Lxts_enc_ret_iv: +.Lxts_ret_iv\@: movups IV, (IVP) -.Lxts_enc_ret: +.Lxts_ret\@: #ifndef __x86_64__ popl KLEN popl KEYP @@ -2932,201 +2752,60 @@ SYM_FUNC_START(aesni_xts_encrypt) FRAME_END RET -.Lxts_enc_1x: +.Lxts_1x\@: add $64, LEN - jz .Lxts_enc_ret_iv + jz .Lxts_ret_iv\@ +.if \enc sub $16, LEN - jl .Lxts_enc_cts4 + jl .Lxts_cts4\@ +.endif -.Lxts_enc_loop1: +.Lxts_loop1\@: movdqu (INP), STATE +.if \enc pxor IV, STATE call _aesni_enc1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_enc_out - +.else add $16, INP sub $16, LEN - jl .Lxts_enc_cts1 - - movdqu STATE, (OUTP) - add $16, OUTP - jmp .Lxts_enc_loop1 - -.Lxts_enc_out: - movdqu STATE, (OUTP) - jmp .Lxts_enc_ret_iv - -.Lxts_enc_cts4: - movdqa STATE4, STATE - sub $16, OUTP - -.Lxts_enc_cts1: -#ifndef __x86_64__ - lea .Lcts_permute_table, T1 -#else - lea .Lcts_permute_table(%rip), T1 -#endif - add LEN, INP /* rewind input pointer */ - add $16, LEN /* # bytes in final block */ - movups (INP), IN1 - - mov T1, IVP - add $32, IVP - add LEN, T1 - sub LEN, IVP - add OUTP, LEN - - movups (T1), %xmm4 - movaps STATE, IN2 - pshufb %xmm4, STATE - 
movups STATE, (LEN) - - movups (IVP), %xmm0 - pshufb %xmm0, IN1 - pblendvb IN2, IN1 - movaps IN1, STATE - + jl .Lxts_cts1\@ pxor IV, STATE - call _aesni_enc1 + call _aesni_dec1 +.endif pxor IV, STATE + _aesni_gf128mul_x_ble - movups STATE, (OUTP) - jmp .Lxts_enc_ret -SYM_FUNC_END(aesni_xts_encrypt) - -/* - * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_decrypt) - FRAME_BEGIN -#ifndef __x86_64__ - pushl IVP - pushl LEN - pushl KEYP - pushl KLEN - movl (FRAME_OFFSET+20)(%esp), KEYP # ctx - movl (FRAME_OFFSET+24)(%esp), OUTP # dst - movl (FRAME_OFFSET+28)(%esp), INP # src - movl (FRAME_OFFSET+32)(%esp), LEN # len - movl (FRAME_OFFSET+36)(%esp), IVP # iv - movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK -#else - movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK -#endif - movups (IVP), IV - - mov 480(KEYP), KLEN - add $240, KEYP - - test $15, LEN - jz .Lxts_dec_loop4 - sub $16, LEN - -.Lxts_dec_loop4: - sub $64, LEN - jl .Lxts_dec_1x - - movdqa IV, STATE1 - movdqu 0x00(INP), IN - pxor IN, STATE1 - movdqu IV, 0x00(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE2 - movdqu 0x10(INP), IN - pxor IN, STATE2 - movdqu IV, 0x10(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE3 - movdqu 0x20(INP), IN - pxor IN, STATE3 - movdqu IV, 0x20(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE4 - movdqu 0x30(INP), IN - pxor IN, STATE4 - movdqu IV, 0x30(OUTP) - - call _aesni_dec4 - - movdqu 0x00(OUTP), IN - pxor IN, STATE1 - movdqu STATE1, 0x00(OUTP) - - movdqu 0x10(OUTP), IN - pxor IN, STATE2 - movdqu STATE2, 0x10(OUTP) - - movdqu 0x20(OUTP), IN - pxor IN, STATE3 - movdqu STATE3, 0x20(OUTP) - - movdqu 0x30(OUTP), IN - pxor IN, STATE4 - movdqu STATE4, 0x30(OUTP) - - _aesni_gf128mul_x_ble() - - add $64, INP - add $64, OUTP test LEN, LEN - jnz .Lxts_dec_loop4 - -.Lxts_dec_ret_iv: - movups IV, (IVP) - -.Lxts_dec_ret: -#ifndef __x86_64__ - popl KLEN - popl KEYP - popl LEN - popl IVP -#endif - FRAME_END - RET - -.Lxts_dec_1x: - add $64, LEN - jz .Lxts_dec_ret_iv - -.Lxts_dec_loop1: - movdqu (INP), STATE + jz .Lxts_out\@ +.if \enc add $16, INP sub $16, LEN - jl .Lxts_dec_cts1 - - pxor IV, STATE - call _aesni_dec1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_dec_out + jl .Lxts_cts1\@ +.endif movdqu STATE, (OUTP) add $16, OUTP - jmp .Lxts_dec_loop1 + jmp .Lxts_loop1\@ -.Lxts_dec_out: +.Lxts_out\@: movdqu STATE, (OUTP) - jmp .Lxts_dec_ret_iv + jmp .Lxts_ret_iv\@ -.Lxts_dec_cts1: +.if \enc +.Lxts_cts4\@: + movdqa STATE4, STATE + sub $16, OUTP +.Lxts_cts1\@: +.else +.Lxts_cts1\@: movdqa IV, STATE4 - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble pxor IV, STATE call _aesni_dec1 pxor IV, STATE - +.endif #ifndef __x86_64__ lea .Lcts_permute_table, T1 #else @@ -3152,10 +2831,32 @@ SYM_FUNC_START(aesni_xts_decrypt) pblendvb IN2, IN1 movaps IN1, STATE +.if \enc + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE +.else pxor STATE4, STATE call _aesni_dec1 pxor STATE4, STATE +.endif movups STATE, (OUTP) - jmp .Lxts_dec_ret -SYM_FUNC_END(aesni_xts_decrypt) + jmp .Lxts_ret\@ +.endm + +/* + * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_enc) + _aesni_xts_crypt 1 +SYM_FUNC_END(aesni_xts_enc) + +/* + * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_dec) + _aesni_xts_crypt 0 +SYM_FUNC_END(aesni_xts_dec) diff --git 
a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index b1d90c25975a..5b25d2a58aeb 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -40,7 +40,6 @@ #define AESNI_ALIGN 16 #define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN))) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) -#define RFC4106_HASH_SUBKEY_SIZE 16 #define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) @@ -87,8 +86,8 @@ static inline void *aes_align_addr(void *addr) return PTR_ALIGN(addr, AESNI_ALIGN); } -asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - unsigned int key_len); +asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len); asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out, @@ -107,11 +106,11 @@ asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, #define AVX_GEN2_OPTSIZE 640 #define AVX_GEN4_OPTSIZE 4096 -asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); -asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); #ifdef CONFIG_X86_64 @@ -233,19 +232,17 @@ static int aes_set_key_common(struct crypto_aes_ctx *ctx, { int err; - if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) - return -EINVAL; - if (!crypto_simd_usable()) - err = aes_expandkey(ctx, in_key, key_len); - else { - kernel_fpu_begin(); - err = aesni_set_key(ctx, in_key, key_len); - kernel_fpu_end(); - } + return aes_expandkey(ctx, in_key, key_len); - return err; + err = aes_check_keylen(key_len); + if (err) + return err; + + kernel_fpu_begin(); + aesni_set_key(ctx, in_key, key_len); + kernel_fpu_end(); + return 0; } static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, @@ -592,23 +589,12 @@ static int xctr_crypt(struct skcipher_request *req) return err; } -static int -rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) +static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key, + u8 hash_subkey[AES_BLOCK_SIZE]) { - struct crypto_aes_ctx ctx; - int ret; - - ret = aes_expandkey(&ctx, key, key_len); - if (ret) - return ret; - - /* Clear the data in the hash sub key container to zero.*/ - /* We want to cipher all zeros to create the hash sub key. 
*/ - memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); - - aes_encrypt(&ctx, hash_subkey, hash_subkey); + static const u8 zeroes[AES_BLOCK_SIZE]; - memzero_explicit(&ctx, sizeof(ctx)); + aes_encrypt(aes_key, hash_subkey, zeroes); return 0; } @@ -626,7 +612,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); + aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, + ctx->hash_subkey); } /* This is the Integrity Check Value (aka the authentication tag) length and can @@ -877,7 +864,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req) } #endif -static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, +static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); @@ -898,108 +885,149 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen); } -static int xts_crypt(struct skcipher_request *req, bool encrypt) +typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); +typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, unsigned int len, + u8 tweak[AES_BLOCK_SIZE]); + +/* This handles cases where the source and/or destination span pages. */ +static noinline int +xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); + const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); int tail = req->cryptlen % AES_BLOCK_SIZE; + struct scatterlist sg_src[2], sg_dst[2]; struct skcipher_request subreq; struct skcipher_walk walk; + struct scatterlist *src, *dst; int err; - if (req->cryptlen < AES_BLOCK_SIZE) - return -EINVAL; - - err = skcipher_walk_virt(&walk, req, false); - if (!walk.nbytes) - return err; - - if (unlikely(tail > 0 && walk.nbytes < walk.total)) { - int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; - - skcipher_walk_abort(&walk); - + /* + * If the message length isn't divisible by the AES block size, then + * separate off the last full block and the partial block. This ensures + * that they are processed in the same call to the assembly function, + * which is required for ciphertext stealing. + */ + if (tail) { skcipher_request_set_tfm(&subreq, tfm); skcipher_request_set_callback(&subreq, skcipher_request_flags(req), NULL, NULL); skcipher_request_set_crypt(&subreq, req->src, req->dst, - blocks * AES_BLOCK_SIZE, req->iv); + req->cryptlen - tail - AES_BLOCK_SIZE, + req->iv); req = &subreq; + } - err = skcipher_walk_virt(&walk, req, false); - if (!walk.nbytes) - return err; - } else { - tail = 0; + err = skcipher_walk_virt(&walk, req, false); + + while (walk.nbytes) { + kernel_fpu_begin(); + (*crypt_func)(&ctx->crypt_ctx, + walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv); + kernel_fpu_end(); + err = skcipher_walk_done(&walk, + walk.nbytes & (AES_BLOCK_SIZE - 1)); } - kernel_fpu_begin(); + if (err || !tail) + return err; - /* calculate first value of T */ - aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv); + /* Do ciphertext stealing with the last full block and partial block. 
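Editor's note: a standalone sketch (plain C, illustration only, not part of the patch) of the length partitioning that xts_crypt_slowpath() sets up above when the message is not a multiple of the AES block size. The bulk pass stops one full block early so that the last full block and the partial tail reach the assembly together, which is what ciphertext stealing requires.

#include <stddef.h>

#define AES_BLOCK_SIZE 16

struct xts_split {
	size_t bulk_len;	/* handled by the main skcipher walk */
	size_t cts_len;		/* last full block + partial tail, done in one call */
};

/* Assumes cryptlen >= AES_BLOCK_SIZE, as xts_crypt() enforces. */
static struct xts_split xts_partition(size_t cryptlen)
{
	size_t tail = cryptlen % AES_BLOCK_SIZE;
	struct xts_split s;

	if (tail == 0) {
		s.bulk_len = cryptlen;		/* no stealing needed */
		s.cts_len = 0;
	} else {
		s.bulk_len = cryptlen - tail - AES_BLOCK_SIZE;
		s.cts_len = AES_BLOCK_SIZE + tail;
	}
	return s;
}

This mirrors the skcipher_request_set_crypt() calls above: the subrequest gets cryptlen - tail - AES_BLOCK_SIZE bytes, and the final pass gets AES_BLOCK_SIZE + tail bytes.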
*/ - while (walk.nbytes > 0) { - int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes &= ~(AES_BLOCK_SIZE - 1); - - if (encrypt) - aesni_xts_encrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - nbytes, walk.iv); - else - aesni_xts_decrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - nbytes, walk.iv); - kernel_fpu_end(); + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - if (walk.nbytes > 0) - kernel_fpu_begin(); - } + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; - if (unlikely(tail > 0 && !err)) { - struct scatterlist sg_src[2], sg_dst[2]; - struct scatterlist *src, *dst; + kernel_fpu_begin(); + (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes, req->iv); + kernel_fpu_end(); - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + return skcipher_walk_done(&walk, 0); +} - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); +/* __always_inline to avoid indirect call in fastpath */ +static __always_inline int +xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv, + xts_crypt_func crypt_func) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); + const unsigned int cryptlen = req->cryptlen; + struct scatterlist *src = req->src; + struct scatterlist *dst = req->dst; - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; + if (unlikely(cryptlen < AES_BLOCK_SIZE)) + return -EINVAL; - kernel_fpu_begin(); - if (encrypt) - aesni_xts_encrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes, walk.iv); - else - aesni_xts_decrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes, walk.iv); - kernel_fpu_end(); + kernel_fpu_begin(); + (*encrypt_iv)(&ctx->tweak_ctx, req->iv); - err = skcipher_walk_done(&walk, 0); + /* + * In practice, virtually all XTS plaintexts and ciphertexts are either + * 512 or 4096 bytes, aligned such that they don't span page boundaries. + * To optimize the performance of these cases, and also any other case + * where no page boundary is spanned, the below fast-path handles + * single-page sources and destinations as efficiently as possible. 
+ */ + if (likely(src->length >= cryptlen && dst->length >= cryptlen && + src->offset + cryptlen <= PAGE_SIZE && + dst->offset + cryptlen <= PAGE_SIZE)) { + struct page *src_page = sg_page(src); + struct page *dst_page = sg_page(dst); + void *src_virt = kmap_local_page(src_page) + src->offset; + void *dst_virt = kmap_local_page(dst_page) + dst->offset; + + (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen, + req->iv); + kunmap_local(dst_virt); + kunmap_local(src_virt); + kernel_fpu_end(); + return 0; } - return err; + kernel_fpu_end(); + return xts_crypt_slowpath(req, crypt_func); +} + +static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]) +{ + aesni_enc(tweak_key, iv, iv); +} + +static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, unsigned int len, + u8 tweak[AES_BLOCK_SIZE]) +{ + aesni_xts_enc(key, dst, src, len, tweak); } -static int xts_encrypt(struct skcipher_request *req) +static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, unsigned int len, + u8 tweak[AES_BLOCK_SIZE]) { - return xts_crypt(req, true); + aesni_xts_dec(key, dst, src, len, tweak); } -static int xts_decrypt(struct skcipher_request *req) +static int xts_encrypt_aesni(struct skcipher_request *req) { - return xts_crypt(req, false); + return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt); +} + +static int xts_decrypt_aesni(struct skcipher_request *req) +{ + return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt); } static struct crypto_alg aesni_cipher_alg = { @@ -1103,9 +1131,9 @@ static struct skcipher_alg aesni_skciphers[] = { .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .walksize = 2 * AES_BLOCK_SIZE, - .setkey = xts_aesni_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, + .setkey = xts_setkey_aesni, + .encrypt = xts_encrypt_aesni, + .decrypt = xts_decrypt_aesni, } }; @@ -1137,7 +1165,149 @@ static struct skcipher_alg aesni_xctr = { }; static struct simd_skcipher_alg *aesni_simd_xctr; -#endif /* CONFIG_X86_64 */ + +asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); + +#define DEFINE_XTS_ALG(suffix, driver_name, priority) \ + \ +asmlinkage void \ +aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ + u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ +asmlinkage void \ +aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ + u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ + \ +static int xts_encrypt_##suffix(struct skcipher_request *req) \ +{ \ + return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \ +} \ + \ +static int xts_decrypt_##suffix(struct skcipher_request *req) \ +{ \ + return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \ +} \ + \ +static struct skcipher_alg aes_xts_alg_##suffix = { \ + .base = { \ + .cra_name = "__xts(aes)", \ + .cra_driver_name = "__" driver_name, \ + .cra_priority = priority, \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = AES_BLOCK_SIZE, \ + .cra_ctxsize = XTS_AES_CTX_SIZE, \ + .cra_module = THIS_MODULE, \ + }, \ + .min_keysize = 2 * AES_MIN_KEY_SIZE, \ + .max_keysize = 2 * AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .walksize = 2 * AES_BLOCK_SIZE, \ + .setkey = xts_setkey_aesni, \ + .encrypt = xts_encrypt_##suffix, \ + .decrypt = xts_decrypt_##suffix, \ +}; \ + \ +static struct simd_skcipher_alg *aes_xts_simdalg_##suffix + +DEFINE_XTS_ALG(aesni_avx, 
"xts-aes-aesni-avx", 500); +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); +DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); +DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); +#endif + +/* + * This is a list of CPU models that are known to suffer from downclocking when + * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS + * implementation with zmm registers won't be used by default. An + * implementation with ymm registers (256-bit vectors) will be used instead. + */ +static const struct x86_cpu_id zmm_exclusion_list[] = { + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_SKYLAKE_X }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_X }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_D }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_L }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_NNPI }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE_L }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE }, + /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ + /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */ + {}, +}; + +static int __init register_xts_algs(void) +{ + int err; + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return 0; + err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1, + &aes_xts_simdalg_aesni_avx); + if (err) + return err; +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_VAES) || + !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) || + !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + return 0; + err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1, + &aes_xts_simdalg_vaes_avx2); + if (err) + return err; + + if (!boot_cpu_has(X86_FEATURE_AVX512BW) || + !boot_cpu_has(X86_FEATURE_AVX512VL) || + !boot_cpu_has(X86_FEATURE_BMI2) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + return 0; + + err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1, + &aes_xts_simdalg_vaes_avx10_256); + if (err) + return err; + + if (x86_match_cpu(zmm_exclusion_list)) + aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; + + err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, + &aes_xts_simdalg_vaes_avx10_512); + if (err) + return err; +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + return 0; +} + +static void unregister_xts_algs(void) +{ + if (aes_xts_simdalg_aesni_avx) + simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, + &aes_xts_simdalg_aesni_avx); +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if (aes_xts_simdalg_vaes_avx2) + simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, + &aes_xts_simdalg_vaes_avx2); + if (aes_xts_simdalg_vaes_avx10_256) + simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, + &aes_xts_simdalg_vaes_avx10_256); + if (aes_xts_simdalg_vaes_avx10_512) + simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, + &aes_xts_simdalg_vaes_avx10_512); +#endif +} +#else /* CONFIG_X86_64 */ +static int __init register_xts_algs(void) +{ + return 0; +} + +static void unregister_xts_algs(void) +{ +} +#endif /* !CONFIG_X86_64 */ #ifdef CONFIG_X86_64 
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, @@ -1146,7 +1316,8 @@ static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); + aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, + ctx->hash_subkey); } static int generic_gcmaes_encrypt(struct aead_request *req) @@ -1276,13 +1447,21 @@ static int __init aesni_init(void) goto unregister_aeads; #endif /* CONFIG_X86_64 */ + err = register_xts_algs(); + if (err) + goto unregister_xts; + return 0; +unregister_xts: + unregister_xts_algs(); #ifdef CONFIG_X86_64 + if (aesni_simd_xctr) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); unregister_aeads: +#endif /* CONFIG_X86_64 */ simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), aesni_simd_aeads); -#endif /* CONFIG_X86_64 */ unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), @@ -1303,6 +1482,7 @@ static void __exit aesni_exit(void) if (boot_cpu_has(X86_FEATURE_AVX)) simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); #endif /* CONFIG_X86_64 */ + unregister_xts_algs(); } late_initcall(aesni_init); diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index ef73a3ab8726..791386d9a83a 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -154,5 +154,6 @@ SYM_TYPED_FUNC_START(nh_avx2) vpaddq T1, T0, T0 vpaddq T4, T0, T0 vmovdqu T0, (HASH) + vzeroupper RET SYM_FUNC_END(nh_avx2) diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9918212faf91..0ffb072be956 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -716,6 +716,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) popq %r13 popq %r12 popq %rbx + vzeroupper RET SYM_FUNC_END(sha256_transform_rorx) diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 537b6dcd7ed8..d515a55a3bc1 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -62,20 +62,41 @@ #define SHA256CONSTANTS %rax -#define MSG %xmm0 +#define MSG %xmm0 /* sha256rnds2 implicit operand */ #define STATE0 %xmm1 #define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define TMP %xmm7 #define SHUF_MASK %xmm8 #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 +.macro do_4rounds i, m0, m1, m2, m3 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif + movdqa (\i-32)*4(SHA256CONSTANTS), MSG + paddd \m0, MSG + sha256rnds2 STATE0, STATE1 +.if \i >= 12 && \i < 60 + movdqa \m0, TMP + palignr $4, \m3, TMP + paddd TMP, \m1 + sha256msg2 \m0, \m1 +.endif + punpckhqdq MSG, MSG + sha256rnds2 STATE1, STATE0 +.if \i >= 4 && \i < 52 + sha256msg1 \m0, \m3 +.endif +.endm + /* * Intel SHA Extensions optimized implementation of a SHA-256 update function * @@ -86,9 +107,6 @@ * store partial blocks. All message padding and hash value initialization must * be done outside the update function. * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. 
- * * void sha256_ni_transform(uint32_t *digest, const void *data, uint32_t numBlocks); * digest : pointer to digest @@ -108,202 +126,29 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) * Need to reorder these appropriately * DCBA, HGFE -> ABEF, CDGH */ - movdqu 0*16(DIGEST_PTR), STATE0 - movdqu 1*16(DIGEST_PTR), STATE1 + movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */ + movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */ - pshufd $0xB1, STATE0, STATE0 /* CDAB */ - pshufd $0x1B, STATE1, STATE1 /* EFGH */ - movdqa STATE0, MSGTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* FEBA */ + punpckhqdq TMP, STATE1 /* DCHG */ + pshufd $0x1B, STATE0, STATE0 /* ABEF */ + pshufd $0xB1, STATE1, STATE1 /* CDGH */ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS + lea K256+32*4(%rip), SHA256CONSTANTS .Lloop0: /* Save hash values for addition after rounds */ movdqa STATE0, ABEF_SAVE movdqa STATE1, CDGH_SAVE - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - movdqa MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - movdqa MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - movdqa MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - movdqa MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - movdqa MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 
STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - movdqa MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - movdqa MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - movdqa MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - movdqa MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - movdqa MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 56-59 */ - movdqa MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 60-63 */ - movdqa MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 +.irp i, 0, 16, 32, 48 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 +.endr /* Add current hash values with previously saved */ paddd ABEF_SAVE, STATE0 @@ -315,14 +160,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) jne .Lloop0 /* Write hash values back in the correct order */ - pshufd $0x1B, STATE0, STATE0 /* FEBA */ - pshufd $0xB1, STATE1, STATE1 /* DCHG */ - movdqa STATE0, MSGTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ - - movdqu STATE0, 0*16(DIGEST_PTR) - movdqu STATE1, 1*16(DIGEST_PTR) + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* GHEF */ + punpckhqdq TMP, STATE1 /* ABCD */ + pshufd $0xB1, STATE0, STATE0 /* HGFE */ + pshufd $0x1B, STATE1, STATE1 /* DCBA */ + + movdqu STATE1, 0*16(DIGEST_PTR) + movdqu STATE0, 1*16(DIGEST_PTR) .Ldone_hash: diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index f08496cd6870..24973f42c43f 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -680,6 +680,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx) pop %r12 pop %rbx + vzeroupper RET SYM_FUNC_END(sha512_transform_rorx) diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index c93e7f5c2a06..ce1cc1622385 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -17,7 +17,7 @@ obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ -obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o 
+obj-$(CONFIG_PREEMPTION) += thunk.o CFLAGS_entry_fred.o += -fno-stack-protector CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6356060caaf3..51cc9c7cb9bd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); - regs->ax = sys_call_table[unr](regs); + regs->ax = x64_sys_call(regs, unr); return true; } return false; @@ -66,7 +66,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call_table[xnr](regs); + regs->ax = x32_sys_call(regs, xnr); return true; } return false; @@ -162,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) if (likely(unr < IA32_NR_syscalls)) { unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[unr](regs); + regs->ax = ia32_sys_call(regs, unr); } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } @@ -189,7 +189,7 @@ static __always_inline bool int80_is_external(void) } /** - * int80_emulation - 32-bit legacy syscall entry + * do_int80_emulation - 32-bit legacy syscall C entry from asm * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in @@ -207,7 +207,7 @@ static __always_inline bool int80_is_external(void) * eax: system call number * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 */ -DEFINE_IDTENTRY_RAW(int80_emulation) +__visible noinstr void do_int80_emulation(struct pt_regs *regs) { int nr; @@ -255,6 +255,71 @@ DEFINE_IDTENTRY_RAW(int80_emulation) instrumentation_end(); syscall_exit_to_user_mode(regs); } + +#ifdef CONFIG_X86_FRED +/* + * A FRED-specific INT80 handler is warranted for the follwing reasons: + * + * 1) As INT instructions and hardware interrupts are separate event + * types, FRED does not preclude the use of vector 0x80 for external + * interrupts. As a result, the FRED setup code does not reserve + * vector 0x80 and calling int80_is_external() is not merely + * suboptimal but actively incorrect: it could cause a system call + * to be incorrectly ignored. + * + * 2) It is called only for handling vector 0x80 of event type + * EVENT_TYPE_SWINT and will never be called to handle any external + * interrupt (event type EVENT_TYPE_EXTINT). + * + * 3) FRED has separate entry flows depending on if the event came from + * user space or kernel space, and because the kernel does not use + * INT insns, the FRED kernel entry handler fred_entry_from_kernel() + * falls through to fred_bad_type() if the event type is + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling + * an INT insn, it can only be from a user level. + * + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will + * likely take a different approach if it is ever needed: it + * probably belongs in either fred_intx()/ fred_other() or + * asm_fred_entrypoint_user(), depending on if this ought to be done + * for all entries from userspace or only system + * calls. + * + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. 
+ */ +DEFINE_FREDENTRY_RAW(int80_emulation) +{ + int nr; + + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * FRED pushed 0 into regs::orig_ax and regs::ax contains the + * syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif #else /* CONFIG_IA32_EMULATION */ /* Handles int $0x80 on a 32bit kernel */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8af2a26b24f6..1b5be07f8669 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -1491,3 +1492,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. 
+ */ +SYM_FUNC_START(clear_bhb_loop) + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f + RET + .align 64, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b + RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_GPL(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index eabf48c4d4b4..11c9b8efdc4c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -7,7 +7,6 @@ #include <asm/asm-offsets.h> #include <asm/current.h> #include <asm/errno.h> -#include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> #include <asm/irqflags.h> @@ -92,6 +91,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY /* * SYSENTER doesn't filter flags, so we need to clear NT and AC @@ -206,6 +206,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY movq %rsp, %rdi call do_fast_syscall_32 @@ -276,3 +277,17 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) + +/* + * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries + * point to C routines, however since this is a system call interface the branch + * history needs to be scrubbed to protect against BHI attacks, and that + * scrubbing needs to take place in assembly code prior to entering any C + * routines. + */ +SYM_CODE_START(int80_emulation) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + CLEAR_BRANCH_HISTORY + jmp do_int80_emulation +SYM_CODE_END(int80_emulation) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index ac120cbdaaf2..89c1476fcdd9 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -28,9 +28,9 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code if (regs->fred_cs.sl > 0) { pr_emerg("PANIC: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - die("invalid or fatal FRED event", regs, regs->orig_ax); + die("invalid or fatal FRED event", regs, error_code); panic("invalid or fatal FRED event"); } else { unsigned long flags = oops_begin(); @@ -38,10 +38,10 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code pr_alert("BUG: invalid or fatal FRED event; event type %u " "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", - regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + regs->fred_ss.type, regs->fred_ss.vector, error_code, fred_event_data(regs), regs->cs, regs->ip); - if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + if (__die("Invalid or fatal FRED event", regs, error_code)) sig = 0; oops_end(flags, regs, sig); @@ -66,7 +66,7 @@ static noinstr void fred_intx(struct pt_regs *regs) /* INT80 */ case IA32_SYSCALL_VECTOR: if (ia32_enabled()) - return int80_emulation(regs); + return fred_int80_emulation(regs); fallthrough; #endif diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cfc9bc73e7f..c2235bae17ef 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -18,8 
+18,25 @@ #include <asm/syscalls_32.h> #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#ifdef CONFIG_X86_32 #define __SYSCALL(nr, sym) __ia32_##sym, - -__visible const sys_call_ptr_t ia32_sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_32.h> }; +#undef __SYSCALL +#endif + +#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); + +long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_32.h> + default: return __ia32_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index be120eec1fc9..33b3f09e6f15 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -11,8 +11,23 @@ #include <asm/syscalls_64.h> #undef __SYSCALL +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ #define __SYSCALL(nr, sym) __x64_##sym, - -asmlinkage const sys_call_ptr_t sys_call_table[] = { +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL + +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); + +long x64_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_64.h> + default: return __x64_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index bdd0e03a1265..03de4a932131 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -11,8 +11,12 @@ #include <asm/syscalls_x32.h> #undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); -asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { -#include <asm/syscalls_x32.h> +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_x32.h> + default: return __x64_sys_ni_syscall(regs); + } }; diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk.S index 119ebdc3d362..119ebdc3d362 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk.S diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S deleted file mode 100644 index da37f42f4549..000000000000 --- a/arch/x86/entry/thunk_32.S +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) - * Copyright 2008 by Steven Rostedt, Red Hat, Inc - * (inspired by Andi Kleen's thunk_64.S) - */ - -#include <linux/export.h> -#include <linux/linkage.h> -#include <asm/asm.h> - -#include "calling.h" - -THUNK preempt_schedule_thunk, preempt_schedule -THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace -EXPORT_SYMBOL(preempt_schedule_thunk) -EXPORT_SYMBOL(preempt_schedule_notrace_thunk) - diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 620f6257bbe9..3d64bcc403cf 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -9,7 +9,9 @@ include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. 
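Editor's note: the syscall_32/64/x32 changes above replace the indirect call through sys_call_table[nr] with a compiler-generated switch (ia32_sys_call(), x64_sys_call(), x32_sys_call()), so the hot path dispatches via direct calls or a jump table rather than an attacker-reachable indirect branch. A minimal standalone sketch of the two dispatch styles, with hypothetical syscall names (not the kernel's macros):

#include <stdio.h>

typedef long (*sys_call_ptr_t)(long arg);

static long sys_alpha(long arg) { return arg + 1; }	/* hypothetical */
static long sys_beta(long arg)  { return arg * 2; }	/* hypothetical */
static long sys_ni(long arg)    { (void)arg; return -38; /* -ENOSYS */ }

/* Old style: table of pointers, reached through an indirect call. */
static const sys_call_ptr_t table[] = { sys_alpha, sys_beta };

static long dispatch_table(unsigned int nr, long arg)
{
	return nr < 2 ? table[nr](arg) : sys_ni(arg);
}

/* New style: a switch of direct calls, as the generated *_sys_call() helpers do. */
static long dispatch_switch(unsigned int nr, long arg)
{
	switch (nr) {
	case 0: return sys_alpha(arg);
	case 1: return sys_beta(arg);
	default: return sys_ni(arg);
	}
}

int main(void)
{
	printf("%ld %ld\n", dispatch_table(1, 20), dispatch_switch(1, 20));
	return 0;
}

As the comments in the hunks note, sys_call_table[] itself is kept only because kernel/trace/trace_syscalls.c still wants the per-syscall addresses.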
KASAN_SANITIZE := n KMSAN_SANITIZE_vclock_gettime.o := n +KMSAN_SANITIZE_vdso32/vclock_gettime.o := n KMSAN_SANITIZE_vgetcpu.o := n +KMSAN_SANITIZE_vdso32/vgetcpu.o := n UBSAN_SANITIZE := n KCSAN_SANITIZE := n @@ -39,6 +41,7 @@ obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o OBJECT_FILES_NON_STANDARD_vdso-image-32.o := n +OBJECT_FILES_NON_STANDARD_vdso-image-x32.o := n OBJECT_FILES_NON_STANDARD_vdso-image-64.o := n OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index a3c0df11d0e6..2fb7d53cf333 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) static bool write_ok_or_segv(unsigned long ptr, size_t size) { - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_err, this could go away. - */ - if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; @@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { - struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_err; long ret; unsigned long orig_dx; @@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code, goto sigsegv; } - tsk = current; - /* * Check for access_ok violations and find the syscall nr. * @@ -234,12 +225,8 @@ bool emulate_vsyscall(unsigned long error_code, goto do_ret; /* skip requested */ /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. + * With a real vsyscall, page faults cause SIGSEGV. */ - prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; - current->thread.sig_on_uaccess_err = 1; - ret = -EFAULT; switch (vsyscall_nr) { case 0: @@ -262,23 +249,12 @@ bool emulate_vsyscall(unsigned long error_code, break; } - current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; - check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. 
*/ + goto sigsegv; } regs->ax = ret; diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index aec16e581f5b..1fc4ce44e743 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -250,7 +250,7 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = /* * AMD Performance Monitor Family 17h and later: */ -static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +static const u64 amd_zen1_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -262,10 +262,39 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; +static const u64 amd_zen2_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9, +}; + +static const u64 amd_zen4_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x100000120, +}; + static u64 amd_pmu_event_map(int hw_event) { - if (boot_cpu_data.x86 >= 0x17) - return amd_f17h_perfmon_event_map[hw_event]; + if (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1a) + return amd_zen4_perfmon_event_map[hw_event]; + + if (cpu_feature_enabled(X86_FEATURE_ZEN2) || boot_cpu_data.x86 >= 0x19) + return amd_zen2_perfmon_event_map[hw_event]; + + if (cpu_feature_enabled(X86_FEATURE_ZEN1)) + return amd_zen1_perfmon_event_map[hw_event]; return amd_perfmon_event_map[hw_event]; } @@ -618,7 +647,7 @@ static void amd_pmu_cpu_dead(int cpu) } } -static inline void amd_pmu_set_global_ctl(u64 ctl) +static __always_inline void amd_pmu_set_global_ctl(u64 ctl) { wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); } @@ -878,6 +907,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) return amd_pmu_adjust_nmi_window(handled); } +/* + * AMD-specific callback invoked through perf_snapshot_branch_stack static + * call, defined in include/linux/perf_event.h. See its definition for API + * details. It's up to caller to provide enough space in *entries* to fit all + * LBR records, otherwise returned result will be truncated to *cnt* entries. 
+ */ +static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt) +{ + struct cpu_hw_events *cpuc; + unsigned long flags; + + /* + * The sequence of steps to freeze LBR should be completely inlined + * and contain no branches to minimize contamination of LBR snapshot + */ + local_irq_save(flags); + amd_pmu_core_disable_all(); + __amd_pmu_lbr_disable(); + + cpuc = this_cpu_ptr(&cpu_hw_events); + + amd_pmu_lbr_read(); + cnt = min(cnt, x86_pmu.lbr_nr); + memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt); + + amd_pmu_v2_enable_all(0); + local_irq_restore(flags); + + return cnt; +} + static int amd_pmu_v2_handle_irq(struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -904,8 +964,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!status) goto done; - /* Read branch records before unfreezing */ - if (status & GLOBAL_STATUS_LBRS_FROZEN) { + /* Read branch records */ + if (x86_pmu.lbr_nr) { amd_pmu_lbr_read(); status &= ~GLOBAL_STATUS_LBRS_FROZEN; } @@ -1414,6 +1474,10 @@ static int __init amd_core_pmu_init(void) static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); + + /* Only support branch_stack snapshot on perfmon v2 */ + if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq) + static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack); } else if (!amd_brs_init()) { /* * BRS requires special event constraints and flushing on ctxsw. diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 4a1e600314d5..19c7b76e21bc 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event) { int ret = 0; - /* LBR is not recommended in counting mode */ - if (!is_sampling_event(event)) - return -EINVAL; - ret = amd_pmu_lbr_setup_filter(event); if (!ret) event->attach_state |= PERF_ATTACH_SCHED_CB; @@ -402,26 +398,23 @@ void amd_pmu_lbr_enable_all(void) wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); } - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + } - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); } void amd_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 dbg_ctl, dbg_extn_cfg; if (!cpuc->lbr_users || !x86_pmu.lbr_nr) return; - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); - - wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + __amd_pmu_lbr_disable(); } __init int amd_pmu_lbr_init(void) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 09050641ce5d..5b0dd07b1ef1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1644,6 +1644,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; diff 
--git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 326c8cd5aa2d..54eb142810fb 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -696,78 +696,78 @@ static const struct cstate_model srf_cstates __initconst = { static const struct x86_cpu_id intel_cstates_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhm_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhm_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snb_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &snb_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &snb_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hswult_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &slm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_D, &slm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &slm_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &snb_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &snb_cstates), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &snb_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &hswult_cstates), - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &hswult_cstates), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &hswult_cstates), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &hswult_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &cnl_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &glm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &glm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates), - X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates), - X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &icx_cstates), - X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &icx_cstates), - - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), - 
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &adl_cstates), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &adl_cstates), + X86_MATCH_VFM(INTEL_NEHALEM, &nhm_cstates), + X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_cstates), + X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhm_cstates), + + X86_MATCH_VFM(INTEL_WESTMERE, &nhm_cstates), + X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_cstates), + X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhm_cstates), + + X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_cstates), + X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snb_cstates), + + X86_MATCH_VFM(INTEL_IVYBRIDGE, &snb_cstates), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &snb_cstates), + + X86_MATCH_VFM(INTEL_HASWELL, &snb_cstates), + X86_MATCH_VFM(INTEL_HASWELL_X, &snb_cstates), + X86_MATCH_VFM(INTEL_HASWELL_G, &snb_cstates), + + X86_MATCH_VFM(INTEL_HASWELL_L, &hswult_cstates), + + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates), + + X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates), + X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates), + X86_MATCH_VFM(INTEL_BROADWELL_G, &snb_cstates), + X86_MATCH_VFM(INTEL_BROADWELL_X, &snb_cstates), + + X86_MATCH_VFM(INTEL_SKYLAKE_L, &snb_cstates), + X86_MATCH_VFM(INTEL_SKYLAKE, &snb_cstates), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &snb_cstates), + + X86_MATCH_VFM(INTEL_KABYLAKE_L, &hswult_cstates), + X86_MATCH_VFM(INTEL_KABYLAKE, &hswult_cstates), + X86_MATCH_VFM(INTEL_COMETLAKE_L, &hswult_cstates), + X86_MATCH_VFM(INTEL_COMETLAKE, &hswult_cstates), + + X86_MATCH_VFM(INTEL_CANNONLAKE_L, &cnl_cstates), + + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_cstates), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_cstates), + + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &glm_cstates), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &glm_cstates), + X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &glm_cstates), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &glm_cstates), + X86_MATCH_VFM(INTEL_ATOM_TREMONT, &glm_cstates), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &glm_cstates), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates), + + X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates), + X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates), + X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_cstates), + X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_cstates), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &icx_cstates), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates), + + X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates), + X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates), + X86_MATCH_VFM(INTEL_ROCKETLAKE, &icl_cstates), + X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_cstates), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_cstates), + X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_cstates), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_cstates), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_cstates), + X86_MATCH_VFM(INTEL_METEORLAKE, &adl_cstates), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2641ba620f12..e010bfed8417 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1237,11 +1237,11 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu = event->pmu; /* - * Make sure we get updated with the first PEBS - * event. 
It will trigger also during removal, but - * that does not hurt: + * Make sure we get updated with the first PEBS event. + * During removal, ->pebs_data_cfg is still valid for + * the last PEBS event. Don't clear it. */ - if (cpuc->n_pebs == 1) + if ((cpuc->n_pebs == 1) && add) cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW; if (needed_cb != pebs_needs_sched_cb(cpuc)) { diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78cd5084104e..dc641b50814e 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -2,6 +2,7 @@ #include <linux/perf_event.h> #include <linux/types.h> +#include <asm/cpu_device_id.h> #include <asm/perf_event.h> #include <asm/msr.h> @@ -1457,7 +1458,7 @@ void __init intel_pmu_lbr_init_atom(void) * to have an operational LBR which can freeze * on PMU interrupt */ - if (boot_cpu_data.x86_model == 28 + if (boot_cpu_data.x86_vfm == INTEL_ATOM_BONNELL && boot_cpu_data.x86_stepping < 10) { pr_cont("LBR disabled due to erratum"); return; @@ -1693,6 +1694,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; lbr->info = x86_pmu.lbr_info; + lbr->has_callstack = x86_pmu_has_lbr_callstack(); } EXPORT_SYMBOL_GPL(x86_perf_get_lbr); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 8e2a12235e62..14db6d9d318b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -22,7 +22,7 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/intel_pt.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include "../perf_event.h" #include "pt.h" @@ -211,11 +211,11 @@ static int __init pt_pmu_hw_init(void) } /* model-specific quirks */ - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL: - case INTEL_FAM6_BROADWELL_D: - case INTEL_FAM6_BROADWELL_G: - case INTEL_FAM6_BROADWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_BROADWELL: + case INTEL_BROADWELL_D: + case INTEL_BROADWELL_G: + case INTEL_BROADWELL_X: /* not setting BRANCH_EN will #GP, erratum BDM106 */ pt_pmu.branch_en_always_on = true; break; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 258e2cdf28fa..419c517b8594 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1829,56 +1829,56 @@ static const struct intel_uncore_init_fun generic_uncore_init __initconst = { }; static const struct x86_cpu_id intel_uncore_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &ivb_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &hsw_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hsw_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &hsw_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &bdw_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &bdw_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snbep_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhmex_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhmex_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ivbep_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &hswep_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &bdx_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &bdx_uncore_init), - 
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &skl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &skl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &mtl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_uncore_init), + X86_MATCH_VFM(INTEL_NEHALEM, &nhm_uncore_init), + X86_MATCH_VFM(INTEL_WESTMERE, &nhm_uncore_init), + X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_uncore_init), + X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_uncore_init), + X86_MATCH_VFM(INTEL_IVYBRIDGE, &ivb_uncore_init), + X86_MATCH_VFM(INTEL_HASWELL, &hsw_uncore_init), + X86_MATCH_VFM(INTEL_HASWELL_L, &hsw_uncore_init), + X86_MATCH_VFM(INTEL_HASWELL_G, &hsw_uncore_init), + X86_MATCH_VFM(INTEL_BROADWELL, &bdw_uncore_init), + X86_MATCH_VFM(INTEL_BROADWELL_G, &bdw_uncore_init), + X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snbep_uncore_init), + X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhmex_uncore_init), + X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhmex_uncore_init), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ivbep_uncore_init), + X86_MATCH_VFM(INTEL_HASWELL_X, &hswep_uncore_init), + X86_MATCH_VFM(INTEL_BROADWELL_X, &bdx_uncore_init), + X86_MATCH_VFM(INTEL_BROADWELL_D, &bdx_uncore_init), + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_uncore_init), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_uncore_init), + X86_MATCH_VFM(INTEL_SKYLAKE, &skl_uncore_init), + X86_MATCH_VFM(INTEL_SKYLAKE_L, &skl_uncore_init), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_uncore_init), + X86_MATCH_VFM(INTEL_KABYLAKE_L, &skl_uncore_init), + X86_MATCH_VFM(INTEL_KABYLAKE, &skl_uncore_init), + X86_MATCH_VFM(INTEL_COMETLAKE_L, &skl_uncore_init), + X86_MATCH_VFM(INTEL_COMETLAKE, &skl_uncore_init), + X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_uncore_init), + X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &icl_uncore_init), + X86_MATCH_VFM(INTEL_ICELAKE, &icl_uncore_init), + 
X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_uncore_init), + X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_uncore_init), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, &tgl_l_uncore_init), + X86_MATCH_VFM(INTEL_TIGERLAKE, &tgl_uncore_init), + X86_MATCH_VFM(INTEL_ROCKETLAKE, &rkl_uncore_init), + X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_uncore_init), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_uncore_init), + X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_uncore_init), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_uncore_init), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init), + X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init), + X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &snr_uncore_init), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 92da8aaa5966..466833478e81 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Nehalem-EX/Westmere-EX uncore support */ +#include <asm/cpu_device_id.h> #include "uncore.h" /* NHM-EX event control */ @@ -1217,7 +1218,7 @@ static struct intel_uncore_type *nhmex_msr_uncores[] = { void nhmex_uncore_cpu_init(void) { - if (boot_cpu_data.x86_model == 46) + if (boot_cpu_data.x86_vfm == INTEL_NEHALEM_EX) uncore_nhmex = true; else nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2eaf0f339849..74b8b21e8990 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* SandyBridge-EP/IvyTown uncore support */ +#include <asm/cpu_device_id.h> #include "uncore.h" #include "uncore_discovery.h" @@ -3285,7 +3286,7 @@ void bdx_uncore_cpu_init(void) uncore_msr_uncores = bdx_msr_uncores; /* Detect systems with no SBOXes */ - if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID)) + if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_D || hswep_has_limit_sbox(BDX_PCU_DID)) uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; @@ -5394,7 +5395,7 @@ static int icx_iio_get_topology(struct intel_uncore_type *type) static void icx_iio_set_mapping(struct intel_uncore_type *type) { /* Detect ICX-D system. 
This case is not supported */ - if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) { + if (boot_cpu_data.x86_vfm == INTEL_ICELAKE_D) { pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group); return; } diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 9e237b30f017..45b1866ff051 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -2,7 +2,7 @@ #include <linux/perf_event.h> #include <linux/sysfs.h> #include <linux/nospec.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include "probe.h" enum perf_msr_id { @@ -43,75 +43,75 @@ static bool test_intel(int idx, void *data) boot_cpu_data.x86 != 6) return false; - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_NEHALEM: - case INTEL_FAM6_NEHALEM_G: - case INTEL_FAM6_NEHALEM_EP: - case INTEL_FAM6_NEHALEM_EX: - - case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_WESTMERE_EP: - case INTEL_FAM6_WESTMERE_EX: - - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_SANDYBRIDGE_X: - - case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_IVYBRIDGE_X: - - case INTEL_FAM6_HASWELL: - case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_HASWELL_L: - case INTEL_FAM6_HASWELL_G: - - case INTEL_FAM6_BROADWELL: - case INTEL_FAM6_BROADWELL_D: - case INTEL_FAM6_BROADWELL_G: - case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - case INTEL_FAM6_EMERALDRAPIDS_X: - case INTEL_FAM6_GRANITERAPIDS_X: - case INTEL_FAM6_GRANITERAPIDS_D: - - case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_D: - case INTEL_FAM6_ATOM_AIRMONT: - - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_D: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_TREMONT_D: - case INTEL_FAM6_ATOM_TREMONT: - case INTEL_FAM6_ATOM_TREMONT_L: - - case INTEL_FAM6_XEON_PHI_KNL: - case INTEL_FAM6_XEON_PHI_KNM: + switch (boot_cpu_data.x86_vfm) { + case INTEL_NEHALEM: + case INTEL_NEHALEM_G: + case INTEL_NEHALEM_EP: + case INTEL_NEHALEM_EX: + + case INTEL_WESTMERE: + case INTEL_WESTMERE_EP: + case INTEL_WESTMERE_EX: + + case INTEL_SANDYBRIDGE: + case INTEL_SANDYBRIDGE_X: + + case INTEL_IVYBRIDGE: + case INTEL_IVYBRIDGE_X: + + case INTEL_HASWELL: + case INTEL_HASWELL_X: + case INTEL_HASWELL_L: + case INTEL_HASWELL_G: + + case INTEL_BROADWELL: + case INTEL_BROADWELL_D: + case INTEL_BROADWELL_G: + case INTEL_BROADWELL_X: + case INTEL_SAPPHIRERAPIDS_X: + case INTEL_EMERALDRAPIDS_X: + case INTEL_GRANITERAPIDS_X: + case INTEL_GRANITERAPIDS_D: + + case INTEL_ATOM_SILVERMONT: + case INTEL_ATOM_SILVERMONT_D: + case INTEL_ATOM_AIRMONT: + + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_D: + case INTEL_ATOM_GOLDMONT_PLUS: + case INTEL_ATOM_TREMONT_D: + case INTEL_ATOM_TREMONT: + case INTEL_ATOM_TREMONT_L: + + case INTEL_XEON_PHI_KNL: + case INTEL_XEON_PHI_KNM: if (idx == PERF_MSR_SMI) return true; break; - case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: - case INTEL_FAM6_COMETLAKE_L: - case INTEL_FAM6_COMETLAKE: - case INTEL_FAM6_ICELAKE_L: - case INTEL_FAM6_ICELAKE: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_ICELAKE_D: - case INTEL_FAM6_TIGERLAKE_L: - case INTEL_FAM6_TIGERLAKE: - case INTEL_FAM6_ROCKETLAKE: - case INTEL_FAM6_ALDERLAKE: - case INTEL_FAM6_ALDERLAKE_L: - case INTEL_FAM6_ATOM_GRACEMONT: - case INTEL_FAM6_RAPTORLAKE: - case INTEL_FAM6_RAPTORLAKE_P: - case INTEL_FAM6_RAPTORLAKE_S: - case INTEL_FAM6_METEORLAKE: - case INTEL_FAM6_METEORLAKE_L: + case INTEL_SKYLAKE_L: + case INTEL_SKYLAKE: + case INTEL_SKYLAKE_X: + case 
INTEL_KABYLAKE_L: + case INTEL_KABYLAKE: + case INTEL_COMETLAKE_L: + case INTEL_COMETLAKE: + case INTEL_ICELAKE_L: + case INTEL_ICELAKE: + case INTEL_ICELAKE_X: + case INTEL_ICELAKE_D: + case INTEL_TIGERLAKE_L: + case INTEL_TIGERLAKE: + case INTEL_ROCKETLAKE: + case INTEL_ALDERLAKE: + case INTEL_ALDERLAKE_L: + case INTEL_ATOM_GRACEMONT: + case INTEL_RAPTORLAKE: + case INTEL_RAPTORLAKE_P: + case INTEL_RAPTORLAKE_S: + case INTEL_METEORLAKE: + case INTEL_METEORLAKE_L: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index fb56518356ec..72b022a1e16c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1329,6 +1329,19 @@ void amd_pmu_lbr_enable_all(void); void amd_pmu_lbr_disable_all(void); int amd_pmu_lbr_hw_config(struct perf_event *event); +static __always_inline void __amd_pmu_lbr_disable(void) +{ + u64 dbg_ctl, dbg_extn_cfg; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + + if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + } +} + #ifdef CONFIG_PERF_EVENTS_AMD_BRS #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index fb2b1961e5a3..ca5f687fa420 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -675,10 +675,8 @@ static const struct attribute_group *rapl_attr_update[] = { static int __init init_rapl_pmus(void) { int maxdie = topology_max_packages() * topology_max_dies_per_package(); - size_t size; - size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); - rapl_pmus = kzalloc(size, GFP_KERNEL); + rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, maxdie), GFP_KERNEL); if (!rapl_pmus) return -ENOMEM; @@ -808,6 +806,9 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5fc45543e955..0569f579338b 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -105,7 +105,7 @@ static bool cpu_is_self(int cpu) * IPI implementation on Hyper-V. */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { struct hv_send_ipi_ex *ipi_arg; unsigned long flags; @@ -132,8 +132,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask, - exclude_self ? cpu_is_self : NULL); + nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask, + exclude_self ? 
cpu_is_self : NULL); /* * 'nr_bank <= 0' means some CPUs in cpumask can't be @@ -147,7 +147,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, - ipi_arg, NULL); + ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); @@ -155,7 +155,7 @@ ipi_mask_ex_done: } static bool __send_ipi_mask(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; @@ -181,7 +181,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; /* @@ -218,7 +218,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, } status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, - ipi_arg.cpu_mask); + ipi_arg.cpu_mask); return hv_result_success(status); do_ex_hypercall: @@ -241,7 +241,7 @@ static bool __send_ipi_one(int cpu, int vector) return false; } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; if (vp >= 64) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 8f3a4d16bb79..17a71e92a343 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -667,14 +667,14 @@ void hyperv_cleanup(void) hv_hypercall_pg = NULL; /* Reset the hypercall page */ - hypercall_msr.as_uint64 = hv_get_register(HV_X64_MSR_HYPERCALL); + hypercall_msr.as_uint64 = hv_get_msr(HV_X64_MSR_HYPERCALL); hypercall_msr.enable = 0; - hv_set_register(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hv_set_msr(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); /* Reset the TSC page */ - tsc_msr.as_uint64 = hv_get_register(HV_X64_MSR_REFERENCE_TSC); + tsc_msr.as_uint64 = hv_get_msr(HV_X64_MSR_REFERENCE_TSC); tsc_msr.enable = 0; - hv_set_register(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + hv_set_msr(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); } void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c index 68a0843d4750..3fa1f2ee7b0d 100644 --- a/arch/x86/hyperv/hv_proc.c +++ b/arch/x86/hyperv/hv_proc.c @@ -3,7 +3,6 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/clockchips.h> -#include <linux/acpi.h> #include <linux/hyperv.h> #include <linux/slab.h> #include <linux/cpuhotplug.h> @@ -116,12 +115,11 @@ free_buf: int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) { - struct hv_add_logical_processor_in *input; - struct hv_add_logical_processor_out *output; + struct hv_input_add_logical_processor *input; + struct hv_output_add_logical_processor *output; u64 status; unsigned long flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* * When adding a logical processor, the hypervisor may return @@ -137,11 +135,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) input->lp_index = lp_index; input->apic_id = apic_id; - input->flags = 0; - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, input, output); 
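	/*
	 * For reference only -- a minimal sketch (not part of this patch) of
	 * what the new hv_numa_node_to_pxm_info() helper is expected to do,
	 * reconstructed from the open-coded proximity-domain setup removed
	 * above; the real helper lives outside arch/x86 and may differ:
	 *
	 *	union hv_proximity_domain_info info = {};
	 *
	 *	if (node != NUMA_NO_NODE) {
	 *		info.domain_id = node_to_pxm(node);
	 *		info.flags.proximity_info_valid = 1;
	 *		info.flags.proximity_preferred = 1;
	 *	}
	 *	return info;
	 */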
local_irq_restore(flags); @@ -166,7 +160,6 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) u64 status; unsigned long irq_flags; int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); /* Root VPs don't seem to need pages deposited */ if (partition_id != hv_current_partition_id) { @@ -185,14 +178,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) input->vp_index = vp_index; input->flags = flags; input->subnode_type = HvSubnodeAny; - if (node != NUMA_NO_NODE) { - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; - } else { - input->proximity_domain_info.as_uint64 = 0; - } + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); local_irq_restore(irq_flags); diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index 737d6f7a6155..151e851bef09 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -16,7 +16,7 @@ #include <asm/paravirt.h> #include <asm/apic.h> -static bool __initdata hv_pvspin = true; +static bool hv_pvspin __initdata = true; static void hv_qlock_kick(int cpu) { @@ -64,6 +64,7 @@ __visible bool hv_vcpu_is_preempted(int vcpu) { return false; } + PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted); void __init hv_init_spinlocks(void) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index edd2f35b2a5e..5c7de79423b8 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -12,6 +12,7 @@ #include <asm/i8259.h> #include <asm/mshyperv.h> #include <asm/realmode.h> +#include <../kernel/smpboot.h> extern struct boot_params boot_params; static struct real_mode_header hv_vtl_real_mode_header; @@ -65,7 +66,7 @@ static void hv_vtl_ap_entry(void) ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params); } -static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored) +static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored) { u64 status; int ret = 0; @@ -79,7 +80,9 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored) struct ldttss_desc *ldt; struct desc_struct *gdt; - u64 rsp = current->thread.sp; + struct task_struct *idle = idle_thread_get(cpu); + u64 rsp = (unsigned long)idle->thread.sp; + u64 rip = (u64)&hv_vtl_ap_entry; native_store_gdt(&gdt_ptr); @@ -206,7 +209,15 @@ static int hv_vtl_apicid_to_vp_id(u32 apic_id) static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { - int vp_id; + int vp_id, cpu; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apicid)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid); vp_id = hv_vtl_apicid_to_vp_id(apicid); @@ -220,7 +231,7 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) return -EINVAL; } - return hv_vtl_bringup_vcpu(vp_id, start_eip); + return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip); } int __init hv_vtl_early_init(void) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index fcd20c6dc7f9..0cb2396de066 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -117,7 +117,7 @@ extern void callthunks_patch_builtin_calls(void); extern void 
callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); -extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -128,7 +128,7 @@ static __always_inline void *callthunks_translate_call_dest(void *dest) return dest; } static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, - void *func) + void *func, void *ip) { return 0; } @@ -294,10 +294,10 @@ static inline int alternatives_text_reserved(void *start, void *end) * Otherwise, if CPU has feature1, newinstr1 is used. * Otherwise, oldinstr is used. */ -#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \ - ft_flags2, input...) \ - asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \ - newinstr2, ft_flags2) \ +#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \ + ft_flags2, input...) \ + asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \ + newinstr2, ft_flags2) \ : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ @@ -307,7 +307,7 @@ static inline int alternatives_text_reserved(void *start, void *end) /* Like alternative_io, but for replacing a direct call with another one. */ #define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \ - asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \ + asm_inline volatile (ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \ : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) /* @@ -316,12 +316,12 @@ static inline int alternatives_text_reserved(void *start, void *end) * Otherwise, if CPU has feature1, function1 is used. * Otherwise, old function is used. */ -#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ - output, input...) \ - asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\ - "call %P[new2]", ft_flags2) \ - : output, ASM_CALL_CONSTRAINT \ - : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ +#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ + output, input...) 
\ + asm_inline volatile (ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \ + "call %c[new2]", ft_flags2) \ + : output, ASM_CALL_CONSTRAINT \ + : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ [new2] "i" (newfunc2), ## input) /* diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 94ce0f7c9d3a..839c58f95114 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -13,6 +13,7 @@ #include <asm/mpspec.h> #include <asm/msr.h> #include <asm/hardirq.h> +#include <asm/io.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -91,14 +92,14 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, + alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) { - return *((volatile u32 *)(APIC_BASE + reg)); + return readl((void __iomem *)(APIC_BASE + reg)); } static inline void native_apic_mem_eoi(void) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 076bf8dee702..25466c4d2134 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -14,6 +14,7 @@ #include <asm/asm.h> #include <asm/fred.h> #include <asm/gsseg.h> +#include <asm/nospec-branch.h> #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index ca8eed1d496a..2bec0c89a95c 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -229,9 +229,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define _ASM_EXTABLE_UA(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) -#define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) - #define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 55a55ec04350..55b4d24356ea 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) } #define arch_atomic_add_return arch_atomic_add_return -static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) -{ - return arch_atomic_add_return(-i, v); -} -#define arch_atomic_sub_return arch_atomic_sub_return +#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v) static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { @@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) } #define arch_atomic_fetch_add arch_atomic_fetch_add -static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic_fetch_sub arch_atomic_fetch_sub +#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v) static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 3486d91b8595..8db2ec4d6cda 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -14,6 +14,32 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } +/* + * Read an atomic64_t non-atomically. 
+ * + * This is intended to be used in cases where a subsequent atomic operation + * will handle the torn value, and can be used to prime the first iteration + * of unconditional try_cmpxchg() loops, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + * + * This is NOT safe to use where the value is not always checked by a + * subsequent atomic operation, such as in conditional try_cmpxchg() loops + * that can break before the atomic operation, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { + * if (condition(val)) + * break; + * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + */ +static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) +{ + /* See comment in arch_atomic_read(). */ + return __READ_ONCE(v->counter); +} + #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...) #ifndef ATOMIC64_EXPORT #define ATOMIC64_DECL_ONE __ATOMIC64_DECL @@ -24,7 +50,7 @@ typedef struct { #ifdef CONFIG_X86_CMPXCHG64 #define __alternative_atomic64(f, g, out, in...) \ - asm volatile("call %P[func]" \ + asm volatile("call %c[func]" \ : out : [func] "i" (atomic64_##g##_cx8), ## in) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) @@ -61,12 +87,18 @@ ATOMIC64_DECL(add_unless); #undef __ATOMIC64_DECL #undef ATOMIC64_EXPORT -static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { - return arch_cmpxchg64(&v->counter, o, n); + return arch_cmpxchg64(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return arch_try_cmpxchg64(&v->counter, old, new); +} +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg + static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; @@ -195,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); } static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); - return old; + return val; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); } static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); - return old; + return val; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ 
i)); } static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); - return old; + return val; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i)); - return old; + return val; } #define arch_atomic64_fetch_add arch_atomic64_fetch_add diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 3165c0feedf7..ae12acae5b06 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) } #define arch_atomic64_add_return arch_atomic64_add_return -static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) -{ - return arch_atomic64_add_return(-i, v); -} -#define arch_atomic64_sub_return arch_atomic64_sub_return +#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v) static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { @@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub +#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v) static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index fe1e7e3cc844..63bdc6b85219 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -79,6 +79,9 @@ do { \ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) +/* Writing to CR3 provides a full memory barrier in switch_mm(). */ +#define smp_mb__after_switch_mm() do { } while (0) + #include <asm-generic/barrier.h> #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index a3e0be0470a4..3e5b111e619d 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -6,11 +6,6 @@ #include <asm/pgtable_types.h> #include <uapi/asm/boot.h> -/* Physical address where kernel should be loaded. */ -#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ - + (CONFIG_PHYSICAL_ALIGN - 1)) \ - & ~(CONFIG_PHYSICAL_ALIGN - 1)) - /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 # define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index b5731c51f0f4..ed2797f132ce 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -3,103 +3,150 @@ #define _ASM_X86_CMPXCHG_32_H /* - * Note: if you use set64_bit(), __cmpxchg64(), or their variants, + * Note: if you use __cmpxchg64(), or their variants, * you need to test for the feature in boot_cpu_data. 
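 *
 * An illustrative check (a sketch, not from the kernel sources) using the
 * corresponding feature bit would be:
 *
 *	if (!boot_cpu_has(X86_FEATURE_CX8))
 *		return -ENODEV;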
*/ -#ifdef CONFIG_X86_CMPXCHG64 -#define arch_cmpxchg64(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_cmpxchg64_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_try_cmpxchg64(ptr, po, n) \ - __try_cmpxchg64((ptr), (unsigned long long *)(po), \ - (unsigned long long)(n)) -#endif +union __u64_halves { + u64 full; + struct { + u32 low, high; + }; +}; + +#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + o.full; \ +}) + + +static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX); +} -static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - u64 prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_cmpxchg64(ptr, old, new,); } -static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - u64 prev; - asm volatile("cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX); } -static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) +static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) { - bool success; - u64 old = *pold; - asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" - CC_SET(z) - : CC_OUT(z) (success), - [ptr] "+m" (*ptr), - "+A" (old) - : "b" ((u32)new), - "c" ((u32)(new >> 32)) - : "memory"); - - if (unlikely(!success)) - *pold = old; - return success; + return __arch_try_cmpxchg64(ptr, oldp, new,); } -#ifndef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CMPXCHG64 + +#define arch_cmpxchg64 __cmpxchg64 + +#define arch_cmpxchg64_local __cmpxchg64_local + +#define arch_try_cmpxchg64 __try_cmpxchg64 + +#define arch_try_cmpxchg64_local __try_cmpxchg64_local + +#else + /* * Building a kernel capable running on 80386 and 80486. It may be necessary * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 
*/ -#define arch_cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io(LOCK_PREFIX_HERE \ - "call cmpxchg8b_emu", \ - "lock; cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) - - -#define arch_cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io("call cmpxchg8b_emu", \ - "cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) +#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; "); +} +#define arch_cmpxchg64 arch_cmpxchg64 + +static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new, ,); +} +#define arch_cmpxchg64_local arch_cmpxchg64_local + +#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; "); +} +#define arch_try_cmpxchg64 arch_try_cmpxchg64 + +static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,); +} +#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local #endif diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 44b08b53ab32..5e241306db26 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,6 +20,12 @@ arch_try_cmpxchg((ptr), (po), (n)); \ }) +#define arch_try_cmpxchg64_local(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg_local((ptr), (po), (n)); \ +}) + union __u128_halves { u128 full; struct { @@ -62,7 +68,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, asm volatile(_lock "cmpxchg16b %[ptr]" \ CC_SET(e) \ : CC_OUT(e) (ret), \ - [ptr] "+m" (*ptr), \ + [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high) \ : "memory"); \ diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index fb7388bbc212..aa6c8f8ca958 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -22,8 +22,10 @@ static inline void cc_set_mask(u64 mask) u64 
cc_mkenc(u64 val); u64 cc_mkdec(u64 val); +void cc_random_init(void); #else #define cc_vendor (CC_VENDOR_NONE) +static const u64 cc_mask = 0; static inline u64 cc_mkenc(u64 val) { @@ -34,6 +36,7 @@ static inline u64 cc_mkdec(u64 val) { return val; } +static inline void cc_random_init(void) { } #endif #endif /* _ASM_X86_COCO_H */ diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index eb8fcede9e3b..970a232009c3 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -3,6 +3,39 @@ #define _ASM_X86_CPU_DEVICE_ID /* + * Can't use <linux/bitfield.h> because it generates expressions that + * cannot be used in structure initializers. Bitfield construction + * here must match the union in struct cpuinfo_86: + * union { + * struct { + * __u8 x86_model; + * __u8 x86; + * __u8 x86_vendor; + * __u8 x86_reserved; + * }; + * __u32 x86_vfm; + * }; + */ +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) + +/* * Declare drivers belonging to specific x86 CPUs * Similar in spirit to pci_device_id and related PCI functions * @@ -49,6 +82,16 @@ .driver_data = (unsigned long) _data \ } +#define X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ + .vendor = _vendor, \ + .family = _family, \ + .model = _model, \ + .steppings = _steppings, \ + .feature = _feature, \ + .driver_data = (unsigned long) _data \ +} + /** * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY @@ -164,6 +207,56 @@ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ steppings, X86_FEATURE_ANY, data) +/** + * X86_MATCH_VFM - Match encoded vendor/family/model + * @vfm: Encoded 8-bits each for vendor, family, model + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. + * + * Stepping and feature are set to wildcards + */ +#define X86_MATCH_VFM(vfm, data) \ + X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ + VFM_VENDOR(vfm), \ + VFM_FAMILY(vfm), \ + VFM_MODEL(vfm), \ + X86_STEPPING_ANY, X86_FEATURE_ANY, data) + +/** + * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping + * @vfm: Encoded 8-bits each for vendor, family, model + * @steppings: Bitmask of steppings to match + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. 
+ * + * feature is set to wildcard + */ +#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \ + X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ + VFM_VENDOR(vfm), \ + VFM_FAMILY(vfm), \ + VFM_MODEL(vfm), \ + steppings, X86_FEATURE_ANY, data) + +/** + * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature + * @vfm: Encoded 8-bits each for vendor, family, model + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. + * + * Steppings is set to wildcard + */ +#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \ + X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ + VFM_VENDOR(vfm), \ + VFM_FAMILY(vfm), \ + VFM_MODEL(vfm), \ + X86_STEPPING_ANY, feature, data) + /* * Match specific microcode revisions. * @@ -190,6 +283,14 @@ struct x86_cpu_desc { .x86_microcode_rev = (revision), \ } +#define AMD_CPU_DESC(fam, model, stepping, revision) { \ + .x86_family = (fam), \ + .x86_vendor = X86_VENDOR_AMD, \ + .x86_model = (model), \ + .x86_stepping = (stepping), \ + .x86_microcode_rev = (revision), \ +} + extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a1273698fc43..0325424c87f3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -33,6 +33,8 @@ enum cpuid_leafs CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, + CPUID_LNX_5, + NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" @@ -91,8 +93,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 21)) + BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -116,8 +119,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 21)) + BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 
1 : \ @@ -168,11 +172,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm goto( - ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") + asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" - " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n" + " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f0337f7bcf16..3c7434329661 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 21 /* N 32-bit words worth of info */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -460,6 +460,18 @@ #define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ /* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc and Linux defined features. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ + +/* * BUG word(s) */ #define X86_BUG(x) (NCAPINTS*32 + (x)) @@ -507,4 +519,5 @@ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/crash_reserve.h b/arch/x86/include/asm/crash_reserve.h index 152239f95541..7835b2cdff04 100644 --- a/arch/x86/include/asm/crash_reserve.h +++ b/arch/x86/include/asm/crash_reserve.h @@ -39,4 +39,6 @@ static inline unsigned long crash_low_size_default(void) #endif } +#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY + #endif /* _X86_CRASH_RESERVE_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index da4054fbf533..c492bdc97b05 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -155,6 +155,7 @@ #define DISABLED_MASK18 (DISABLE_IBT) #define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define DISABLED_MASK21 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index e8f58ddd06d9..2e74a7f0e935 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -17,6 +17,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern u64 e820__range_remove(u64 start, u64 size, 
enum e820_type old_type, bool check_type); +extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern void e820__print_table(char *who); extern int e820__update_table(struct e820_table *table); diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 7acf0383be80..906b0d5541e8 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -36,7 +36,7 @@ #define EX_TYPE_DEFAULT 1 #define EX_TYPE_FAULT 2 #define EX_TYPE_UACCESS 3 -#define EX_TYPE_COPY 4 +/* unused, was: #define EX_TYPE_COPY 4 */ #define EX_TYPE_CLEAR_FS 5 #define EX_TYPE_FPU_RESTORE 6 #define EX_TYPE_BPF 7 diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2ff26f53cd62..3787d26810c1 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -182,7 +182,7 @@ enum hv_isolation_type { #define HV_X64_MSR_HYPERCALL 0x40000001 /* MSR used to provide vcpu index */ -#define HV_REGISTER_VP_INDEX 0x40000002 +#define HV_X64_MSR_VP_INDEX 0x40000002 /* MSR used to reset the guest OS. */ #define HV_X64_MSR_RESET 0x40000003 @@ -191,10 +191,10 @@ enum hv_isolation_type { #define HV_X64_MSR_VP_RUNTIME 0x40000010 /* MSR used to read the per-partition time reference counter */ -#define HV_REGISTER_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 /* A partition's reference time stamp counter (TSC) page */ -#define HV_REGISTER_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 /* MSR used to retrieve the TSC frequency */ #define HV_X64_MSR_TSC_FREQUENCY 0x40000022 @@ -209,61 +209,61 @@ enum hv_isolation_type { #define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 /* Define synthetic interrupt controller model specific registers. 
*/ -#define HV_REGISTER_SCONTROL 0x40000080 -#define HV_REGISTER_SVERSION 0x40000081 -#define HV_REGISTER_SIEFP 0x40000082 -#define HV_REGISTER_SIMP 0x40000083 -#define HV_REGISTER_EOM 0x40000084 -#define HV_REGISTER_SINT0 0x40000090 -#define HV_REGISTER_SINT1 0x40000091 -#define HV_REGISTER_SINT2 0x40000092 -#define HV_REGISTER_SINT3 0x40000093 -#define HV_REGISTER_SINT4 0x40000094 -#define HV_REGISTER_SINT5 0x40000095 -#define HV_REGISTER_SINT6 0x40000096 -#define HV_REGISTER_SINT7 0x40000097 -#define HV_REGISTER_SINT8 0x40000098 -#define HV_REGISTER_SINT9 0x40000099 -#define HV_REGISTER_SINT10 0x4000009A -#define HV_REGISTER_SINT11 0x4000009B -#define HV_REGISTER_SINT12 0x4000009C -#define HV_REGISTER_SINT13 0x4000009D -#define HV_REGISTER_SINT14 0x4000009E -#define HV_REGISTER_SINT15 0x4000009F +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F /* * Define synthetic interrupt controller model specific registers for * nested hypervisor. */ -#define HV_REGISTER_NESTED_SCONTROL 0x40001080 -#define HV_REGISTER_NESTED_SVERSION 0x40001081 -#define HV_REGISTER_NESTED_SIEFP 0x40001082 -#define HV_REGISTER_NESTED_SIMP 0x40001083 -#define HV_REGISTER_NESTED_EOM 0x40001084 -#define HV_REGISTER_NESTED_SINT0 0x40001090 +#define HV_X64_MSR_NESTED_SCONTROL 0x40001080 +#define HV_X64_MSR_NESTED_SVERSION 0x40001081 +#define HV_X64_MSR_NESTED_SIEFP 0x40001082 +#define HV_X64_MSR_NESTED_SIMP 0x40001083 +#define HV_X64_MSR_NESTED_EOM 0x40001084 +#define HV_X64_MSR_NESTED_SINT0 0x40001090 /* * Synthetic Timer MSRs. Four timers per vcpu. 
*/ -#define HV_REGISTER_STIMER0_CONFIG 0x400000B0 -#define HV_REGISTER_STIMER0_COUNT 0x400000B1 -#define HV_REGISTER_STIMER1_CONFIG 0x400000B2 -#define HV_REGISTER_STIMER1_COUNT 0x400000B3 -#define HV_REGISTER_STIMER2_CONFIG 0x400000B4 -#define HV_REGISTER_STIMER2_COUNT 0x400000B5 -#define HV_REGISTER_STIMER3_CONFIG 0x400000B6 -#define HV_REGISTER_STIMER3_COUNT 0x400000B7 +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 /* Hyper-V guest idle MSR */ #define HV_X64_MSR_GUEST_IDLE 0x400000F0 /* Hyper-V guest crash notification MSR's */ -#define HV_REGISTER_CRASH_P0 0x40000100 -#define HV_REGISTER_CRASH_P1 0x40000101 -#define HV_REGISTER_CRASH_P2 0x40000102 -#define HV_REGISTER_CRASH_P3 0x40000103 -#define HV_REGISTER_CRASH_P4 0x40000104 -#define HV_REGISTER_CRASH_CTL 0x40000105 +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 /* TSC emulation after migration */ #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 @@ -276,31 +276,38 @@ enum hv_isolation_type { /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ #define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0) -/* Register name aliases for temporary compatibility */ -#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT -#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG -#define HV_X64_MSR_STIMER1_COUNT HV_REGISTER_STIMER1_COUNT -#define HV_X64_MSR_STIMER1_CONFIG HV_REGISTER_STIMER1_CONFIG -#define HV_X64_MSR_STIMER2_COUNT HV_REGISTER_STIMER2_COUNT -#define HV_X64_MSR_STIMER2_CONFIG HV_REGISTER_STIMER2_CONFIG -#define HV_X64_MSR_STIMER3_COUNT HV_REGISTER_STIMER3_COUNT -#define HV_X64_MSR_STIMER3_CONFIG HV_REGISTER_STIMER3_CONFIG -#define HV_X64_MSR_SCONTROL HV_REGISTER_SCONTROL -#define HV_X64_MSR_SVERSION HV_REGISTER_SVERSION -#define HV_X64_MSR_SIMP HV_REGISTER_SIMP -#define HV_X64_MSR_SIEFP HV_REGISTER_SIEFP -#define HV_X64_MSR_VP_INDEX HV_REGISTER_VP_INDEX -#define HV_X64_MSR_EOM HV_REGISTER_EOM -#define HV_X64_MSR_SINT0 HV_REGISTER_SINT0 -#define HV_X64_MSR_SINT15 HV_REGISTER_SINT15 -#define HV_X64_MSR_CRASH_P0 HV_REGISTER_CRASH_P0 -#define HV_X64_MSR_CRASH_P1 HV_REGISTER_CRASH_P1 -#define HV_X64_MSR_CRASH_P2 HV_REGISTER_CRASH_P2 -#define HV_X64_MSR_CRASH_P3 HV_REGISTER_CRASH_P3 -#define HV_X64_MSR_CRASH_P4 HV_REGISTER_CRASH_P4 -#define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL -#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT -#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC +/* + * To support arch-generic code calling hv_set/get_register: + * - On x86, HV_MSR_ indicates an MSR accessed via rdmsrl/wrmsrl + * - On ARM, HV_MSR_ indicates a VP register accessed via hypercall + */ +#define HV_MSR_CRASH_P0 (HV_X64_MSR_CRASH_P0) +#define HV_MSR_CRASH_P1 (HV_X64_MSR_CRASH_P1) +#define HV_MSR_CRASH_P2 (HV_X64_MSR_CRASH_P2) +#define HV_MSR_CRASH_P3 (HV_X64_MSR_CRASH_P3) +#define HV_MSR_CRASH_P4 (HV_X64_MSR_CRASH_P4) +#define HV_MSR_CRASH_CTL (HV_X64_MSR_CRASH_CTL) + +#define HV_MSR_VP_INDEX (HV_X64_MSR_VP_INDEX) +#define HV_MSR_TIME_REF_COUNT (HV_X64_MSR_TIME_REF_COUNT) +#define HV_MSR_REFERENCE_TSC 
(HV_X64_MSR_REFERENCE_TSC) + +#define HV_MSR_SINT0 (HV_X64_MSR_SINT0) +#define HV_MSR_SVERSION (HV_X64_MSR_SVERSION) +#define HV_MSR_SCONTROL (HV_X64_MSR_SCONTROL) +#define HV_MSR_SIEFP (HV_X64_MSR_SIEFP) +#define HV_MSR_SIMP (HV_X64_MSR_SIMP) +#define HV_MSR_EOM (HV_X64_MSR_EOM) + +#define HV_MSR_NESTED_SCONTROL (HV_X64_MSR_NESTED_SCONTROL) +#define HV_MSR_NESTED_SVERSION (HV_X64_MSR_NESTED_SVERSION) +#define HV_MSR_NESTED_SIEFP (HV_X64_MSR_NESTED_SIEFP) +#define HV_MSR_NESTED_SIMP (HV_X64_MSR_NESTED_SIMP) +#define HV_MSR_NESTED_EOM (HV_X64_MSR_NESTED_EOM) +#define HV_MSR_NESTED_SINT0 (HV_X64_MSR_NESTED_SINT0) + +#define HV_MSR_STIMER0_CONFIG (HV_X64_MSR_STIMER0_CONFIG) +#define HV_MSR_STIMER0_COUNT (HV_X64_MSR_STIMER0_COUNT) /* * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 4212c00c9708..9d69f3f8dbab 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -56,17 +56,6 @@ struct stat64 { unsigned long long st_ino; } __attribute__((packed)); -#define IA32_STACK_TOP IA32_PAGE_OFFSET - -#ifdef __KERNEL__ -struct linux_binprm; -extern int ia32_setup_arg_pages(struct linux_binprm *bprm, - unsigned long stack_top, int exec_stack); -struct mm_struct; -extern void ia32_pick_mmap_layout(struct mm_struct *mm); - -#endif - extern bool __ia32_enabled; static __always_inline bool ia32_enabled(void) diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h deleted file mode 100644 index aa065c94ccf5..000000000000 --- a/arch/x86/include/asm/ia32_unistd.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_IA32_UNISTD_H -#define _ASM_X86_IA32_UNISTD_H - -/* - * This file contains the system call numbers of the ia32 compat ABI, - * this is for the kernel only. 
- */ -#define __SYSCALL_ia32_NR(x) (x) -#include <asm/unistd_32_ia32.h> - -#endif /* _ASM_X86_IA32_UNISTD_H */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index b65e9c46b922..f81a851c46dc 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -40,136 +40,221 @@ * their own names :-( */ +#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model) + /* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ #define INTEL_FAM6_ANY X86_MODEL_ANY +/* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */ +#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) #define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_CORE_YONAH IFM(6, 0x0E) #define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_CORE2_MEROM IFM(6, 0x0F) #define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_CORE2_MEROM_L IFM(6, 0x16) #define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_CORE2_PENRYN IFM(6, 0x17) #define INTEL_FAM6_CORE2_DUNNINGTON 0x1D +#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D) #define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_NEHALEM IFM(6, 0x1E) #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ +#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */ #define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_NEHALEM_EP IFM(6, 0x1A) #define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_NEHALEM_EX IFM(6, 0x2E) #define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_WESTMERE IFM(6, 0x25) #define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_WESTMERE_EP IFM(6, 0x2C) #define INTEL_FAM6_WESTMERE_EX 0x2F +#define INTEL_WESTMERE_EX IFM(6, 0x2F) #define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_SANDYBRIDGE IFM(6, 0x2A) #define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D) #define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_IVYBRIDGE IFM(6, 0x3A) #define INTEL_FAM6_IVYBRIDGE_X 0x3E +#define INTEL_IVYBRIDGE_X IFM(6, 0x3E) #define INTEL_FAM6_HASWELL 0x3C +#define INTEL_HASWELL IFM(6, 0x3C) #define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_HASWELL_X IFM(6, 0x3F) #define INTEL_FAM6_HASWELL_L 0x45 +#define INTEL_HASWELL_L IFM(6, 0x45) #define INTEL_FAM6_HASWELL_G 0x46 +#define INTEL_HASWELL_G IFM(6, 0x46) #define INTEL_FAM6_BROADWELL 0x3D +#define INTEL_BROADWELL IFM(6, 0x3D) #define INTEL_FAM6_BROADWELL_G 0x47 +#define INTEL_BROADWELL_G IFM(6, 0x47) #define INTEL_FAM6_BROADWELL_X 0x4F +#define INTEL_BROADWELL_X IFM(6, 0x4F) #define INTEL_FAM6_BROADWELL_D 0x56 +#define INTEL_BROADWELL_D IFM(6, 0x56) #define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */ +#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */ #define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */ +#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */ #define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */ +#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */ /* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */ /* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */ #define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */ +#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */ /* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */ /* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */ /* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */ #define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */ +#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */ /* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */ #define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */ +#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */ #define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */ +#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */ #define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */ +#define 
INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */ #define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */ +#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */ +#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */ +#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ +#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ +#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */ #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ +#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */ #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ +#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */ #define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ +#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */ #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ +#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ #define INTEL_FAM6_EMERALDRAPIDS_X 0xCF +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) #define INTEL_FAM6_GRANITERAPIDS_X 0xAD +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) #define INTEL_FAM6_GRANITERAPIDS_D 0xAE +#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ #define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ +#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */ #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ +#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ +#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */ #define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ +#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */ #define INTEL_FAM6_RAPTORLAKE_P 0xBA +#define INTEL_RAPTORLAKE_P IFM(6, 0xBA) #define INTEL_FAM6_RAPTORLAKE_S 0xBF +#define INTEL_RAPTORLAKE_S IFM(6, 0xBF) #define INTEL_FAM6_METEORLAKE 0xAC +#define INTEL_METEORLAKE IFM(6, 0xAC) #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_METEORLAKE_L IFM(6, 0xAA) #define INTEL_FAM6_ARROWLAKE_H 0xC5 +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) #define INTEL_FAM6_ARROWLAKE 0xC6 +#define INTEL_ARROWLAKE IFM(6, 0xC6) +#define INTEL_FAM6_ARROWLAKE_U 0xB5 +#define INTEL_ARROWLAKE_U IFM(6, 0xB5) #define INTEL_FAM6_LUNARLAKE_M 0xBD +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ +#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */ #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ +#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */ #define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ +#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */ #define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ +#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */ #define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ +#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */ #define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ +#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ #define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ +#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ #define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ +#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* 
Merriefield */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ +#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ #define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ +#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ #define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ +#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ +#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ #define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ +#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */ /* Note: the micro-architecture is "Goldmont Plus" */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ +#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */ #define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ +#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */ #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ +#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */ #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ +#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */ #define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ +#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */ #define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ +#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */ #define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ +#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */ #define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ +#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */ /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ +#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */ #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ /* Family 5 */ #define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ +#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 798183867d78..b71ad173f877 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -100,7 +100,7 @@ } #define ASM_CALL_ARG0 \ - "call %P[__func] \n" \ + "call %c[__func] \n" \ ASM_REACHABLE #define ASM_CALL_ARG1 \ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 16e07a2eee19..6efd1497b026 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -855,6 +855,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + bool is_amd_compatible; /* * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index de3118305838..dfd2e9699bd7 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -13,6 +13,7 @@ #define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ #define MCG_EXT_P BIT_ULL(9) /* Extended registers available */ #define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) @@ -25,6 +26,7 @@ #define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */ 
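
The irq_stack.h hunk above swaps the deprecated "%P" asm operand modifier for "%c", which prints an "i"-constraint operand as a bare symbol or constant (no "$" prefix). A minimal userspace sketch of the modifier, outside the kernel, using an illustrative local function and assuming GCC/Clang extended asm on x86-64 (some PIE toolchains may reject the "i" constraint here):

    #include <stdio.h>

    static void target(void)
    {
            puts("reached target()");
    }

    int main(void)
    {
            void (*fp)(void);

            /* "%c1" emits the bare symbol name, so this assembles as
             * "lea target(%rip), <reg>" instead of an invalid "$target" form. */
            asm ("lea %c1(%%rip), %0" : "=r" (fp) : "i" (target));
            fp();
            return 0;
    }
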
#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */ #define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */ +#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */ /* MCG_EXT_CTL register defines */ #define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ce4ce8720d55..390c4d13956d 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -293,24 +293,24 @@ static inline void hv_ivm_msr_write(u64 msr, u64 value) {} static inline void hv_ivm_msr_read(u64 msr, u64 *value) {} #endif -static inline bool hv_is_synic_reg(unsigned int reg) +static inline bool hv_is_synic_msr(unsigned int reg) { - return (reg >= HV_REGISTER_SCONTROL) && - (reg <= HV_REGISTER_SINT15); + return (reg >= HV_X64_MSR_SCONTROL) && + (reg <= HV_X64_MSR_SINT15); } -static inline bool hv_is_sint_reg(unsigned int reg) +static inline bool hv_is_sint_msr(unsigned int reg) { - return (reg >= HV_REGISTER_SINT0) && - (reg <= HV_REGISTER_SINT15); + return (reg >= HV_X64_MSR_SINT0) && + (reg <= HV_X64_MSR_SINT15); } -u64 hv_get_register(unsigned int reg); -void hv_set_register(unsigned int reg, u64 value); -u64 hv_get_non_nested_register(unsigned int reg); -void hv_set_non_nested_register(unsigned int reg, u64 value); +u64 hv_get_msr(unsigned int reg); +void hv_set_msr(unsigned int reg, u64 value); +u64 hv_get_non_nested_msr(unsigned int reg); +void hv_set_non_nested_msr(unsigned int reg, u64 value); -static __always_inline u64 hv_raw_get_register(unsigned int reg) +static __always_inline u64 hv_raw_get_msr(unsigned int reg) { return __rdmsr(reg); } @@ -331,10 +331,10 @@ static inline int hyperv_flush_guest_mapping_range(u64 as, { return -1; } -static inline void hv_set_register(unsigned int reg, u64 value) { } -static inline u64 hv_get_register(unsigned int reg) { return 0; } -static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { } -static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } +static inline void hv_set_msr(unsigned int reg, u64 value) { } +static inline u64 hv_get_msr(unsigned int reg) { return 0; } +static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { } +static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; } #endif /* CONFIG_HYPERV */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 05956bd8bacf..e022e6eb766c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,10 +61,13 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) /* A mask for bits which the kernel toggles when controlling mitigations */ #define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ - | SPEC_CTRL_RRSBA_DIS_S) + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -163,6 +166,14 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. 
+ */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. @@ -185,11 +196,6 @@ * File. */ -#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* - * IA32_XAPIC_DISABLE_STATUS MSR - * supported - */ - #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* * Writeback and invalidate the diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index fc3a8a3c7ffe..ff5f1ecc7d1e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -262,11 +262,20 @@ .Lskip_rsb_\@: .endm +/* + * The CALL to srso_alias_untrain_ret() must be patched in directly at + * the spot where untraining must be done, ie., srso_alias_untrain_ret() + * must be the target of a CALL instruction instead of indirectly + * jumping to a wrapper which then calls it. Therefore, this macro is + * called outside of __UNTRAIN_RET below, for the time being, before the + * kernel can support nested alternatives with arbitrary nesting. + */ +.macro CALL_UNTRAIN_RET #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) -#define CALL_UNTRAIN_RET "call entry_untrain_ret" -#else -#define CALL_UNTRAIN_RET "" + ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \ + "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS #endif +.endm /* * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the @@ -282,8 +291,8 @@ .macro __UNTRAIN_RET ibpb_feature, call_depth_insns #if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ + CALL_UNTRAIN_RET + ALTERNATIVE_2 "", \ "call entry_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif @@ -317,6 +326,19 @@ ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF .endm +#ifdef CONFIG_X86_64 +.macro CLEAR_BRANCH_HISTORY + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP +.endm + +.macro CLEAR_BRANCH_HISTORY_VMEXIT + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT +.endm +#else +#define CLEAR_BRANCH_HISTORY +#define CLEAR_BRANCH_HISTORY_VMEXIT +#endif + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -342,6 +364,8 @@ extern void retbleed_return_thunk(void); static inline void retbleed_return_thunk(void) {} #endif +extern void srso_alias_untrain_ret(void); + #ifdef CONFIG_MITIGATION_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); @@ -357,6 +381,10 @@ extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_X86_64 +extern void clear_bhb_loop(void); +#endif + extern void (*x86_return_thunk)(void); extern void __warn_thunk(void); diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 9da9c8a2f1df..52f1b4ff0cc1 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -31,10 +31,12 @@ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC -#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \ - CONFIG_PHYSICAL_ALIGN) +/* Physical address where kernel should be loaded. 
*/ +#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + + (CONFIG_PHYSICAL_ALIGN - 1)) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) +#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR) #ifdef CONFIG_X86_64 #include <asm/page_64_types.h> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 3736b8a46c04..7f1e17250546 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -555,6 +555,7 @@ struct x86_pmu_lbr { unsigned int from; unsigned int to; unsigned int info; + bool has_callstack; }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 0b748ee16b3d..9abb8cc4cd47 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -148,7 +148,7 @@ #define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | \ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ - _PAGE_DEVMAP | _PAGE_ENC | _PAGE_UFFD_WP) + _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP) #define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) #define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) @@ -173,6 +173,7 @@ enum page_cache_mode { }; #endif +#define _PAGE_CC (_AT(pteval_t, cc_mask)) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 811548f131f4..cb4f6c513c48 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -108,9 +108,23 @@ struct cpuinfo_topology { }; struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; + union { + /* + * The particular ordering (low-to-high) of (vendor, + * family, model) is done in case range of models, like + * it is usually done on AMD, need to be compared. + */ + struct { + __u8 x86_model; + /* CPU family */ + __u8 x86; + /* CPU vendor */ + __u8 x86_vendor; + __u8 x86_reserved; + }; + /* combined vendor, family, model */ + __u32 x86_vfm; + }; __u8 x86_stepping; #ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ @@ -472,7 +486,6 @@ struct thread_struct { unsigned long iopl_emul; unsigned int iopl_warn:1; - unsigned int sig_on_uaccess_err:1; /* * Protection Keys Register for Userspace. 
Loaded immediately on @@ -587,7 +600,7 @@ extern char ignore_fpu_irq; # define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 %P1" +# define BASE_PREFETCH "prefetcht0 %1" #endif /* @@ -598,7 +611,7 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, "prefetchnta %P1", + alternative_input(BASE_PREFETCH, "prefetchnta %1", X86_FEATURE_XMM, "m" (*(const char *)x)); } @@ -610,7 +623,7 @@ static inline void prefetch(const void *x) */ static __always_inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, "prefetchw %P1", + alternative_input(BASE_PREFETCH, "prefetchw %1", X86_FEATURE_3DNOWPREFETCH, "m" (*(const char *)x)); } @@ -636,12 +649,10 @@ static __always_inline void prefetchw(const void *x) #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -extern unsigned long __end_init_task[]; +extern unsigned long __top_init_kernel_stack[]; #define INIT_THREAD { \ - .sp = (unsigned long)&__end_init_task - \ - TOP_OF_KERNEL_STACK_PADDING - \ - sizeof(struct pt_regs), \ + .sp = (unsigned long)&__top_init_kernel_stack, \ } extern unsigned long KSTK_ESP(struct task_struct *task); diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index cde8357bb226..a053c1293975 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -85,6 +85,8 @@ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { + int val; + if (!static_branch_likely(&virt_spin_lock_key)) return false; @@ -94,10 +96,13 @@ static inline bool virt_spin_lock(struct qspinlock *lock) * horrible lock 'holder' preemption issues. */ - do { - while (atomic_read(&lock->val) != 0) - cpu_relax(); - } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); + __retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } return true; } diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index ef9697f20129..0a985784be9b 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -25,9 +25,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock) * { - * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); + * u8 lockval = _Q_LOCKED_VAL; * - * if (likely(lockval == _Q_LOCKED_VAL)) + * if (try_cmpxchg(&lock->locked, &lockval, 0)) * return; * pv_queued_spin_unlock_slowpath(lock, lockval); * } @@ -40,10 +40,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define PV_UNLOCK_ASM \ FRAME_BEGIN \ "push %rdx\n\t" \ - "mov $0x1,%eax\n\t" \ + "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \ "xor %edx,%edx\n\t" \ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ - "cmp $0x1,%al\n\t" \ "jne .slowpath\n\t" \ "pop %rdx\n\t" \ FRAME_END \ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 7ba1726b71c7..e9187ddd3d1f 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -99,6 +99,7 @@ #define REQUIRED_MASK18 0 #define REQUIRED_MASK19 0 #define REQUIRED_MASK20 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define REQUIRED_MASK21 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) 
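
The processor.h hunk above overlays (model, family, vendor) with a single x86_vfm word precisely so that ranged model checks collapse into one integer comparison. A standalone sketch of the packing, assuming a little-endian layout matching the union's byte order and an illustrative vendor value; the Zen 5 model range 0x00-0x2f is taken from the amd.c hunk later in this patch:

    #include <stdint.h>
    #include <stdio.h>

    /* model in the low byte, family next, vendor above -- mirrors the union */
    #define VFM(vendor, family, model) \
            (((uint32_t)(vendor) << 16) | ((uint32_t)(family) << 8) | (uint32_t)(model))

    #define VENDOR_AMD 2    /* illustrative, not necessarily the kernel's enum value */

    int main(void)
    {
            uint32_t vfm = VFM(VENDOR_AMD, 0x1a, 0x24);

            /* one ranged compare instead of family == 0x1a && 0x00 <= model <= 0x2f */
            if (vfm >= VFM(VENDOR_AMD, 0x1a, 0x00) && vfm <= VFM(VENDOR_AMD, 0x1a, 0x2f))
                    puts("falls in the Zen 5 model range");
            return 0;
    }
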
#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index fef16e398161..42bcd42d70d1 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -9,7 +9,7 @@ #endif #ifdef CONFIG_COMPAT -#include <asm/ia32_unistd.h> +#include <asm/unistd_32_ia32.h> #define __NR_seccomp_read_32 __NR_ia32_read #define __NR_seccomp_write_32 __NR_ia32_write #define __NR_seccomp_exit_32 __NR_ia32_exit diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9477b4053bce..93ed60080cfe 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -218,17 +218,16 @@ void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages); -void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void __noreturn snp_abort(void); +void snp_dmi_setup(void); int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); -void kdump_sev_callback(void); void sev_show_status(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } @@ -244,12 +243,12 @@ static inline void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } static inline void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } -static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } +static inline void snp_dmi_setup(void) { } static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) { return -ENOTTY; @@ -258,7 +257,6 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } -static inline void kdump_sev_callback(void) { } static inline void sev_show_status(void) { } #endif @@ -270,6 +268,8 @@ int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); +void kdump_sev_callback(void); +void snp_fixup_e820_tables(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } @@ -282,6 +282,8 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; 
} static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} +static inline void kdump_sev_callback(void) { } +static inline void snp_fixup_e820_tables(void) {} #endif #endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2e9fc5c400cd..aec6e2d3aa1d 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -182,8 +182,8 @@ static __always_inline void clflush(volatile void *__p) static inline void clflushopt(volatile void *__p) { - alternative_io(".byte 0x3e; clflush %P0", - ".byte 0x66; clflush %P0", + alternative_io(".byte 0x3e; clflush %0", + ".byte 0x66; clflush %0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } @@ -205,9 +205,9 @@ static inline void clwb(volatile void *__p) #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { - asm goto("1: wrussq %[val], (%[addr])\n" + asm goto("1: wrussq %[val], %[addr]\n" _ASM_EXTABLE(1b, %l[fail]) - :: [addr] "r" (addr), [val] "r" (val) + :: [addr] "m" (*addr), [val] "r" (val) :: fail); return 0; fail: diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 857d364b9888..9d0b324eab21 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -30,37 +30,40 @@ void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMSET16 static inline void *memset16(uint16_t *s, uint16_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosw" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosw" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #define __HAVE_ARCH_MEMSET32 static inline void *memset32(uint32_t *s, uint32_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosl" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosl" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #define __HAVE_ARCH_MEMSET64 static inline void *memset64(uint64_t *s, uint64_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosq" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosq" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #endif diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index a800abb1a992..d8416b3bf832 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,11 +12,6 @@ /* image of the saved processor state */ struct saved_context { - /* - * On x86_32, all segment registers except gs are saved at kernel - * entry in pt_regs. - */ - u16 gs; unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; struct saved_msrs saved_msrs; @@ -27,6 +22,11 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + /* + * On x86_32, all segment registers except gs are saved at kernel + * entry in pt_regs. 
+ */ + u16 gs; bool misc_enable_saved; } __attribute__((packed)); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f44e2f9ab65d..2fc7bc3863ff 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,19 +16,17 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> +/* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; -#if defined(CONFIG_X86_32) -#define ia32_sys_call_table sys_call_table -#else /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). */ -extern const sys_call_ptr_t ia32_sys_call_table[]; -extern const sys_call_ptr_t x32_sys_call_table[]; -#endif +extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. @@ -127,6 +125,7 @@ static inline int syscall_get_arch(struct task_struct *task) } bool do_syscall_64(struct pt_regs *regs, int nr); +void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 237dc8cdd12b..0f9bab92a43d 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -78,7 +78,7 @@ extern int __get_user_bad(void); int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ - asm volatile("call __" #fn "_%P4" \ + asm volatile("call __" #fn "_%c4" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ @@ -177,7 +177,7 @@ extern void __put_user_nocheck_8(void); __chk_user_ptr(__ptr); \ __ptr_pu = __ptr; \ __val_pu = __x; \ - asm volatile("call __" #fn "_%P[size]" \ + asm volatile("call __" #fn "_%c[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ : "0" (__ptr_pu), \ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b89b40f250e6..6149eabe200f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -30,12 +30,13 @@ struct x86_init_mpparse { * @reserve_resources: reserve the standard resources for the * platform * @memory_setup: platform specific memory setup - * + * @dmi_setup: platform specific DMI setup */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); char *(*memory_setup)(void); + void (*dmi_setup)(void); }; /** diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index a9088250770f..64fbd2dbc5b7 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,6 +62,11 @@ void xen_arch_unregister_cpu(int num); #ifdef CONFIG_PVH void __init xen_pvh_init(struct boot_params *boot_params); void __init mem_map_via_hcall(struct boot_params *boot_params_p); +#ifdef CONFIG_XEN_PVH +void __init xen_reserve_extra_memory(struct boot_params *bootp); +#else +static inline void xen_reserve_extra_memory(struct boot_params *bootp) { } +#endif #endif /* Lazy mode for batching updates / context switch */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ad29984d5e39..ef11aa4cab42 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -694,6 +694,7 @@ enum sev_cmd_id { struct kvm_sev_cmd { __u32 id; + __u32 
pad0; __u64 data; __u32 error; __u32 sev_fd; @@ -704,28 +705,35 @@ struct kvm_sev_launch_start { __u32 policy; __u64 dh_uaddr; __u32 dh_len; + __u32 pad0; __u64 session_uaddr; __u32 session_len; + __u32 pad1; }; struct kvm_sev_launch_update_data { __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_launch_secret { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; struct kvm_sev_launch_measure { __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_guest_status { @@ -738,33 +746,43 @@ struct kvm_sev_dbg { __u64 src_uaddr; __u64 dst_uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_attestation_report { __u8 mnonce[16]; __u64 uaddr; __u32 len; + __u32 pad0; }; struct kvm_sev_send_start { __u32 policy; + __u32 pad0; __u64 pdh_cert_uaddr; __u32 pdh_cert_len; + __u32 pad1; __u64 plat_certs_uaddr; __u32 plat_certs_len; + __u32 pad2; __u64 amd_certs_uaddr; __u32 amd_certs_len; + __u32 pad3; __u64 session_uaddr; __u32 session_len; + __u32 pad4; }; struct kvm_sev_send_update_data { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; struct kvm_sev_receive_start { @@ -772,17 +790,22 @@ struct kvm_sev_receive_start { __u32 policy; __u64 pdh_uaddr; __u32 pdh_len; + __u32 pad0; __u64 session_uaddr; __u32 session_len; + __u32 pad1; }; struct kvm_sev_receive_update_data { __u64 hdr_uaddr; __u32 hdr_len; + __u32 pad0; __u64 guest_uaddr; __u32 guest_len; + __u32 pad1; __u64 trans_uaddr; __u32 trans_len; + __u32 pad2; }; #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 6bc3456a8ebf..a1efa7907a0b 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data { __u32 token; __u8 pad[56]; - __u32 enabled; }; #define KVM_PV_EOI_BIT 0 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 74077694da7d..5d128167e2e2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o -obj-y += pci-dma.o quirks.o topology.o kdebugfs.o +obj-y += pci-dma.o quirks.o kdebugfs.o obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += resource.o diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 5bf5f9fc5753..3cf156f70859 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -95,6 +95,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, {} diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a42d8a6f7149..0d22aefbde7f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -497,32 +497,32 @@ static struct clock_event_device lapic_clockevent = { static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static const struct x86_cpu_id 
deadline_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ + X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ + X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), + X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), + X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), + X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), + X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), + X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), + X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), + X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), + X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), + X86_MATCH_VFM(INTEL_HASWELL, 0x22), + X86_MATCH_VFM(INTEL_HASWELL_L, 0x20), + X86_MATCH_VFM(INTEL_HASWELL_G, 0x17), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), + X86_MATCH_VFM(INTEL_BROADWELL, 0x25), + X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), + X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2), + X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2), - X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), - X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), + X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52), + X86_MATCH_VFM(INTEL_KABYLAKE, 0x52), {}, }; @@ -1687,11 +1687,11 @@ static int x2apic_state; static bool x2apic_hw_locked(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; u64 msr; - ia32_cap = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } @@ -1771,7 +1771,7 @@ void x2apic_setup(void) __x2apic_enable(); } -static __init void apic_set_fixmap(void); +static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { @@ -1793,7 +1793,12 @@ static __init void x2apic_disable(void) } __x2apic_disable(); - apic_set_fixmap(); + /* + * Don't reread the APIC ID as it was already done from + * check_x2apic() and the APIC driver still is a x2APIC variant, + * which fails to do the read after x2APIC was disabled. 
+ */ + apic_set_fixmap(false); } static __init void x2apic_enable(void) @@ -2057,13 +2062,14 @@ void __init init_apic_mappings(void) } } -static __init void apic_set_fixmap(void) +static __init void apic_set_fixmap(bool read_apic) { set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_mmio_base = APIC_BASE; apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr); - apic_read_boot_cpu_id(false); + if (read_apic) + apic_read_boot_cpu_id(false); } void __init register_lapic_address(unsigned long address) @@ -2073,7 +2079,7 @@ void __init register_lapic_address(unsigned long address) mp_lapic_addr = address; if (!x2apic_mode) - apic_set_fixmap(); + apic_set_fixmap(true); } /* diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 567dbd2fe4b6..7db83212effb 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -178,13 +178,16 @@ static int x2apic_prepare_cpu(unsigned int cpu) u32 phys_apicid = apic->cpu_present_to_apicid(cpu); u32 cluster = apic_cluster(phys_apicid); u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf)); + int node = cpu_to_node(cpu); x86_cpu_to_logical_apicid[cpu] = logical_apicid; - if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0) + if (alloc_clustermask(cpu, cluster, node) < 0) return -ENOMEM; - if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + + if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node)) return -ENOMEM; + return 0; } diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 30335182b6b0..e92ff0c11db8 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -314,7 +314,7 @@ static bool is_callthunk(void *addr) return !bcmp(pad, insn_buff, tmpl_size); } -int x86_call_depth_emit_accounting(u8 **pprog, void *func) +int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; @@ -327,7 +327,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, tmpl_size, *pprog, + apply_relocation(insn_buff, tmpl_size, ip, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6d8677e80ddb..44df3f11e731 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -13,6 +13,7 @@ #include <asm/apic.h> #include <asm/cacheinfo.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> #include <asm/smp.h> #include <asm/numa.h> @@ -345,6 +346,28 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } +static void bsp_determine_snp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM + cc_vendor = CC_VENDOR_AMD; + + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and is defined by the + * per-processor PPR. Restrict SNP support on the known CPU models + * for which the RMP table entry format is currently defined for. 
+ */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + c->x86 >= 0x19 && snp_probe_rmptable_info()) { + cc_platform_set(CC_ATTR_HOST_SEV_SNP); + } else { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + } + } +#endif +} + static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -437,8 +460,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) case 0x1a: switch (c->x86_model) { - case 0x00 ... 0x0f: - case 0x20 ... 0x2f: + case 0x00 ... 0x2f: case 0x40 ... 0x4f: case 0x70 ... 0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); @@ -452,21 +474,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) break; } - if (cpu_has(c, X86_FEATURE_SEV_SNP)) { - /* - * RMP table entry format is not architectural and it can vary by processor - * and is defined by the per-processor PPR. Restrict SNP support on the - * known CPU model and family for which the RMP table entry format is - * currently defined for. - */ - if (!boot_cpu_has(X86_FEATURE_ZEN3) && - !boot_cpu_has(X86_FEATURE_ZEN4) && - !boot_cpu_has(X86_FEATURE_ZEN5)) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - else if (!snp_probe_rmptable_info()) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - } - + bsp_determine_snp(c); return; warn: @@ -527,7 +535,6 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { - u64 value; u32 dummy; if (c->x86 >= 0xf) @@ -595,20 +602,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_detect_mem_encrypt(c); - /* Re-enable TopologyExtensions if switched off by BIOS */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); @@ -802,6 +795,11 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +static const struct x86_cpu_desc erratum_1386_microcode[] = { + AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e), + AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052), +}; + static void fix_erratum_1386(struct cpuinfo_x86 *c) { /* @@ -811,7 +809,13 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c) * * Affected parts all have no supervisor XSAVE states, meaning that * the XSAVEC instruction (which works fine) is equivalent. + * + * Clear the feature flag only on microcode revisions which + * don't have the fix. 
*/ + if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode)) + return; + clear_cpu_cap(c, X86_FEATURE_XSAVES); } diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index fdbb5f07448f..f9a8c7b7943f 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -124,25 +124,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) return true; } -#define X86_MATCH(model) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) +#define X86_MATCH(vfm) \ + X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { - X86_MATCH(XEON_PHI_KNL), - X86_MATCH(XEON_PHI_KNM), + X86_MATCH(INTEL_XEON_PHI_KNL), + X86_MATCH(INTEL_XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { - X86_MATCH(SKYLAKE_X), + X86_MATCH(INTEL_SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { - X86_MATCH(ATOM_GOLDMONT), - X86_MATCH(ATOM_GOLDMONT_D), - X86_MATCH(ATOM_GOLDMONT_PLUS), + X86_MATCH(INTEL_ATOM_GOLDMONT), + X86_MATCH(INTEL_ATOM_GOLDMONT_D), + X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS), {} }; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e7ba936d798b..b6f927f6c567 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -26,7 +26,7 @@ #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> @@ -61,6 +61,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); +static u64 __ro_after_init x86_arch_cap_msr; + static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; @@ -144,6 +146,8 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } + x86_arch_cap_msr = x86_read_arch_cap_msr(); + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -301,8 +305,6 @@ static const char * const taa_strings[] = { static void __init taa_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; @@ -341,9 +343,8 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - ia32_cap = x86_read_arch_cap_msr(); - if ( (ia32_cap & ARCH_CAP_MDS_NO) && - !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* @@ -401,8 +402,6 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { @@ -413,8 +412,6 @@ static void __init mmio_select_mitigation(void) if (mmio_mitigation == MMIO_MITIGATION_OFF) return; - ia32_cap = x86_read_arch_cap_msr(); - /* * Enable CPU buffer clear mitigation for host and VMM, if also affected * by MDS or TAA. Otherwise, enable mitigation for VMM only. 
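
The bugs.c hunks above and below replace repeated x86_read_arch_cap_msr() calls with a single cached read into x86_arch_cap_msr, taken once in cpu_select_mitigations() before the individual *_select_mitigation() helpers run. A userspace sketch of that read-once pattern, with a stubbed MSR read and illustrative bit positions:

    #include <stdint.h>
    #include <stdio.h>

    #define ARCH_CAP_MDS_NO         (1ULL << 5)     /* bit positions illustrative */
    #define ARCH_CAP_FBSDP_NO       (1ULL << 14)

    static uint64_t arch_cap;                       /* models x86_arch_cap_msr */

    static uint64_t read_arch_cap(void)             /* stand-in for the RDMSR */
    {
            return ARCH_CAP_MDS_NO;
    }

    static void mmio_select_mitigation(void)
    {
            /* consumers test the cached copy instead of re-reading the MSR */
            if (!(arch_cap & ARCH_CAP_FBSDP_NO))
                    puts("enable fill-buffer clearing on idle");
    }

    int main(void)
    {
            arch_cap = read_arch_cap();             /* one read, done early */
            mmio_select_mitigation();
            return 0;
    }
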
@@ -437,7 +434,7 @@ static void __init mmio_select_mitigation(void) * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ - if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); /* @@ -447,10 +444,10 @@ static void __init mmio_select_mitigation(void) * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS * affected systems. */ - if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || (boot_cpu_has(X86_FEATURE_MD_CLEAR) && boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(ia32_cap & ARCH_CAP_MDS_NO))) + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) mmio_mitigation = MMIO_MITIGATION_VERW; else mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; @@ -508,7 +505,7 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; @@ -659,8 +656,6 @@ void update_srbds_msr(void) static void __init srbds_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; @@ -669,8 +664,7 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - ia32_cap = x86_read_arch_cap_msr(); - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) @@ -813,7 +807,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -1544,20 +1538,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +static bool __ro_after_init rrsba_disabled; + /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 ia32_cap; + if (rrsba_disabled) + return; - if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) { + rrsba_disabled = true; return; + } - ia32_cap = x86_read_arch_cap_msr(); + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; - if (ia32_cap & ARCH_CAP_RRSBA) { - x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - update_spec_ctrl(x86_spec_ctrl_base); - } + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + rrsba_disabled = true; } static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) @@ -1607,6 +1606,74 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ dump_stack(); } +/* + * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by + * branch history in userspace. Not needed if BHI_NO is set. 
+ */ +static bool __init spec_ctrl_bhi_dis(void) +{ + if (!boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW); + + return true; +} + +enum bhi_mitigations { + BHI_MITIGATION_OFF, + BHI_MITIGATION_ON, +}; + +static enum bhi_mitigations bhi_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + +static int __init spectre_bhi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + bhi_mitigation = BHI_MITIGATION_OFF; + else if (!strcmp(str, "on")) + bhi_mitigation = BHI_MITIGATION_ON; + else + pr_err("Ignoring unknown spectre_bhi option (%s)", str); + + return 0; +} +early_param("spectre_bhi", spectre_bhi_parse_cmdline); + +static void __init bhi_select_mitigation(void) +{ + if (bhi_mitigation == BHI_MITIGATION_OFF) + return; + + /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { + spec_ctrl_disable_kernel_rrsba(); + if (rrsba_disabled) + return; + } + + if (spec_ctrl_bhi_dis()) + return; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Mitigate KVM by default */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n"); + + /* Mitigate syscalls when the mitigation is forced =on */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n"); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1718,6 +1785,9 @@ static void __init spectre_v2_select_mitigation(void) mode == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); + if (boot_cpu_has(X86_BUG_BHI)) + bhi_select_mitigation(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -1832,8 +1902,6 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { - u64 ia32_cap = x86_read_arch_cap_msr(); - /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. 
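
The hunk above adds bhi_select_mitigation(), whose order of preference is: do nothing for spectre_bhi=off, rely on retpoline when RRSBA behaviour is off, otherwise set hardware BHI_DIS_S when BHI_CTRL exists, and finally fall back to the 64-bit software BHB-clearing sequence at VM exit and syscall. A condensed userspace sketch of that decision order with stubbed predicates (it ignores the RRSBA_CTRL disable path and the actual capability/MSR writes):

    #include <stdbool.h>
    #include <stdio.h>

    static bool cmdline_off;                /* spectre_bhi=off */
    static bool retpoline = true;           /* retpoline, non-LFENCE flavour */
    static bool rrsba_behaviour;            /* CPU has RRSBA behaviour */
    static bool bhi_ctrl;                   /* BHI_DIS_S control exists */

    static void bhi_select_mitigation(void)
    {
            if (cmdline_off)
                    return;

            /* retpoline already covers BHI unless the CPU has RRSBA behaviour */
            if (retpoline && !rrsba_behaviour) {
                    puts("BHI: covered by retpoline");
                    return;
            }

            if (bhi_ctrl) {
                    puts("BHI: set SPEC_CTRL.BHI_DIS_S");
                    return;
            }

            /* 64-bit software fallback */
            puts("BHI: SW BHB clearing on VM exit");
            puts("BHI: SW BHB clearing on syscall");
    }

    int main(void)
    {
            bhi_select_mitigation();
            return 0;
    }
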
@@ -1848,7 +1916,7 @@ static void update_mds_branch_idle(void) if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || - (ia32_cap & ARCH_CAP_FBSDP_NO)) { + (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } @@ -2323,20 +2391,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c) if (c->x86 != 6) return; - switch (c->x86_model) { - case INTEL_FAM6_NEHALEM: - case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL: - case INTEL_FAM6_HASWELL_L: - case INTEL_FAM6_HASWELL_G: - case INTEL_FAM6_BROADWELL: - case INTEL_FAM6_BROADWELL_G: - case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: + switch (c->x86_vfm) { + case INTEL_NEHALEM: + case INTEL_WESTMERE: + case INTEL_SANDYBRIDGE: + case INTEL_IVYBRIDGE: + case INTEL_HASWELL: + case INTEL_HASWELL_L: + case INTEL_HASWELL_G: + case INTEL_BROADWELL: + case INTEL_BROADWELL_G: + case INTEL_SKYLAKE_L: + case INTEL_SKYLAKE: + case INTEL_KABYLAKE_L: + case INTEL_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; @@ -2695,15 +2763,15 @@ static char *stibp_state(void) switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: - return ", STIBP: disabled"; + return "; STIBP: disabled"; case SPECTRE_V2_USER_STRICT: - return ", STIBP: forced"; + return "; STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: - return ", STIBP: always-on"; + return "; STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) - return ", STIBP: conditional"; + return "; STIBP: conditional"; } return ""; } @@ -2712,10 +2780,10 @@ static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) - return ", IBPB: always-on"; + return "; IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) - return ", IBPB: conditional"; - return ", IBPB: disabled"; + return "; IBPB: conditional"; + return "; IBPB: disabled"; } return ""; } @@ -2725,14 +2793,32 @@ static char *pbrsb_eibrs_state(void) if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; + return "; PBRSB-eIBRS: SW sequence"; else - return ", PBRSB-eIBRS: Vulnerable"; + return "; PBRSB-eIBRS: Vulnerable"; } else { - return ", PBRSB-eIBRS: Not affected"; + return "; PBRSB-eIBRS: Not affected"; } } +static const char *spectre_bhi_state(void) +{ + if (!boot_cpu_has_bug(X86_BUG_BHI)) + return "; BHI: Not affected"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + return "; BHI: BHI_DIS_S"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + return "; BHI: SW loop, KVM: SW loop"; + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && + rrsba_disabled) + return "; BHI: Retpoline"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Vulnerable, KVM: SW loop"; + + return "; BHI: Vulnerable"; +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2745,13 +2831,15 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n", 
spectre_v2_strings[spectre_v2_enabled], ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), + spectre_bhi_state(), + /* this should always be at the end */ spectre_v2_module_string()); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ba8cf5e9ce56..cdaa795a9371 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -114,17 +114,17 @@ static const struct x86_cpu_id ppin_cpuids[] = { X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]), /* Legacy models without CPUID enumeration */ - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), {} }; @@ -1053,18 +1053,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - bool vp_bits_from_cpuid = true; if (!cpu_has(c, X86_FEATURE_CPUID) || - (c->extended_cpuid_level < 0x80000008)) - vp_bits_from_cpuid = false; - - if (vp_bits_from_cpuid) { - cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } else { + (c->extended_cpuid_level < 0x80000008)) { if (IS_ENABLED(CONFIG_X86_64)) { c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1078,7 +1069,13 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_PSE36)) c->x86_phys_bits = 36; } + } else { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; } + c->x86_cache_bits = c->x86_phys_bits; c->x86_cache_alignment = c->x86_clflush_size; } @@ -1120,12 +1117,13 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) #define NO_EIBRS_PBRSB BIT(10) +#define NO_BHI BIT(11) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, 
model, whitelist) -#define VULNWL_INTEL(model, whitelist) \ - VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) +#define VULNWL_INTEL(vfm, whitelist) \ + X86_MATCH_VFM(vfm, whitelist) #define VULNWL_AMD(family, whitelist) \ VULNWL(AMD, family, X86_MODEL_ANY, whitelist) @@ -1142,32 +1140,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(TIGERLAKE, NO_MMIO), - VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), - VULNWL_INTEL(ALDERLAKE, NO_MMIO), - VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO), + VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO), + VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO), - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(CORE_YONAH, NO_SSB), + VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + 
VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1177,33 +1175,31 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), {} }; #define VULNBL(vendor, family, model, blacklist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) -#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, steppings, \ - X86_FEATURE_ANY, issues) +#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \ + X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues) #define VULNBL_AMD(family, blacklist) \ VULNBL(AMD, family, X86_MODEL_ANY, blacklist) @@ -1228,43 +1224,43 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RFDS BIT(7) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), 
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), + 
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), + VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), @@ -1283,25 +1279,25 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi u64 x86_read_arch_cap_msr(void) { - u64 ia32_cap = 0; + u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); - return ia32_cap; + return x86_arch_cap_msr; } -static bool arch_cap_mmio_immune(u64 ia32_cap) +static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr) { - return (ia32_cap & ARCH_CAP_FBSDP_NO && - ia32_cap & ARCH_CAP_PSDP_NO && - ia32_cap & ARCH_CAP_SBDR_SSDP_NO); + return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO && + x86_arch_cap_msr & ARCH_CAP_PSDP_NO && + x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO); } -static bool __init vulnerable_to_rfds(u64 ia32_cap) +static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) { /* The "immunity" bit trumps everything else: */ - if (ia32_cap & ARCH_CAP_RFDS_NO) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO) return false; /* @@ -1309,7 +1305,7 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) * indicate that mitigation is needed because guest is running on a * vulnerable hardware or may migrate to such hardware: */ - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) return true; /* Only consult the blacklist when there is no enumeration: */ @@ -1318,11 +1314,11 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { - u64 ia32_cap = x86_read_arch_cap_msr(); + u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && - !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) @@ -1334,7 +1330,7 @@ static 
void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1345,17 +1341,17 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Don't use AutoIBRS when SNP is enabled because it degrades host * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || + if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || (cpu_has(c, X86_FEATURE_AUTOIBRS) && !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && - !(ia32_cap & ARCH_CAP_MDS_NO)) { + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); @@ -1374,9 +1370,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * TSX_CTRL check alone is not sufficient for cases when the microcode * update is not present or running as guest that don't get TSX_CTRL. */ - if (!(ia32_cap & ARCH_CAP_TAA_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) && (cpu_has(c, X86_FEATURE_RTM) || - (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); /* @@ -1402,7 +1398,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ - if (!arch_cap_mmio_immune(ia32_cap)) { + if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) @@ -1410,7 +1406,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { - if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA)) setup_force_cpu_bug(X86_BUG_RETBLEED); } @@ -1428,18 +1424,25 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * disabling AVX2. The only way to do this in HW is to clear XCR0[2], * which means that AVX will be disabled. */ - if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) && boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); - if (vulnerable_to_rfds(ia32_cap)) + if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); + /* When virtualized, eIBRS could be hidden, assume vulnerable */ + if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && + !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || + boot_cpu_has(X86_FEATURE_HYPERVISOR))) + setup_force_cpu_bug(X86_BUG_BHI); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! 
*/ - if (ia32_cap & ARCH_CAP_RDCL_NO) + if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); @@ -2307,6 +2310,8 @@ void arch_smt_update(void) void __init arch_cpu_finalize_init(void) { + struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); + identify_boot_cpu(); select_idle_routine(); @@ -2345,6 +2350,13 @@ void __init arch_cpu_finalize_init(void) fpu__init_system(); fpu__init_cpu(); + /* + * Ensure that access to the per CPU representation has the initial + * boot CPU configuration. + */ + *c = boot_cpu_data; + c->initialized = true; + alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b7174209d855..946813d816bf 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -44,7 +44,10 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_VAES, X86_FEATURE_AVX }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, @@ -56,9 +59,6 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be30d7fa2e66..3c3e7e5695ba 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -228,6 +228,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c) if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { pr_info_once("x86/tme: not enabled by BIOS\n"); mktme_status = MKTME_DISABLED; + clear_cpu_cap(c, X86_FEATURE_TME); return; } diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index f18d35fe27a9..30b1d63b97f3 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu) } static const struct x86_cpu_id intel_epb_normal[] = { - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), {} }; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index ad6776081e60..8651643bddae 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -17,8 +17,7 @@ * * A typical table entry would be to match a specific CPU * - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, - * X86_FEATURE_ANY, NULL); + * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with 
%X86_VENDOR_ANY, %X86_FAMILY_ANY, * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) @@ -26,7 +25,7 @@ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts * for various common selections. The above can be shortened to: * - * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); + * X86_MATCH_VFM(INTEL_BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b5cc557cfc37..ad0623b659ed 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -47,7 +47,7 @@ #include <linux/kexec.h> #include <asm/fred.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> @@ -1593,6 +1593,24 @@ noinstr void do_machine_check(struct pt_regs *regs) else queue_task_work(&m, msg, kill_me_maybe); + } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) { + /* + * Saved RIP on stack makes it look like the machine check + * was taken in the kernel on the instruction following + * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates + * that the machine check was taken inside SEAM non-root + * mode. CPU core has already marked that guest as dead. + * It is OK for the kernel to resume execution at the + * apparent point of the machine check as the fault did + * not occur there. Mark the page as poisoned so it won't + * be added to free list when the guest is terminated. + */ + if (mce_usable_address(&m)) { + struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT); + + if (p) + SetPageHWPoison(p); + } } else { /* * Handle an MCE which has happened in kernel space but from @@ -1930,14 +1948,14 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) cfg->bootlog = 0; - if (c->x86 == 6 && c->x86_model == 45) + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) mce_flags.snb_ifu_quirk = 1; /* * Skylake, Cascacde Lake and Cooper Lake require a quirk on * rep movs. 
*/ - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) + if (c->x86_vfm == INTEL_SKYLAKE_X) mce_flags.skx_repmov_quirk = 1; } @@ -2500,12 +2518,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return -EINVAL; b = &per_cpu(mce_banks_array, s->id)[bank]; - if (!b->init) return -ENODEV; b->ctl = new; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); return size; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 399b62e223d2..f6103e6bf69a 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -13,7 +13,7 @@ #include <linux/cpumask.h> #include <asm/apic.h> #include <asm/cpufeature.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -455,10 +455,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c) { u64 error_control; - switch (c->x86_model) { - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_X: + switch (c->x86_vfm) { + case INTEL_SANDYBRIDGE_X: + case INTEL_IVYBRIDGE_X: + case INTEL_HASWELL_X: if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; @@ -484,12 +484,11 @@ bool intel_filter_mce(struct mce *m) struct cpuinfo_x86 *c = &boot_cpu_data; /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ - if ((c->x86 == 6) && - ((c->x86_model == INTEL_FAM6_HASWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_L) || - (c->x86_model == INTEL_FAM6_BROADWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_G) || - (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + if ((c->x86_vfm == INTEL_HASWELL || + c->x86_vfm == INTEL_HASWELL_L || + c->x86_vfm == INTEL_BROADWELL || + c->x86_vfm == INTEL_HASWELL_G || + c->x86_vfm == INTEL_SKYLAKE_X) && (m->bank == 0) && ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) return true; diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index c4477162c07d..dac4d64dfb2a 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -12,7 +12,7 @@ #include <linux/uaccess.h> #include <asm/mce.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/traps.h> #include <asm/insn.h> #include <asm/insn-eval.h> @@ -39,20 +39,20 @@ static struct severity { u64 mask; u64 result; unsigned char sev; - unsigned char mcgmask; - unsigned char mcgres; + unsigned short mcgmask; + unsigned short mcgres; unsigned char ser; unsigned char context; unsigned char excp; unsigned char covered; - unsigned char cpu_model; + unsigned int cpu_vfm; unsigned char cpu_minstepping; unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) 
{ .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h -#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s +#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -128,7 +128,7 @@ static struct severity { MCESEV( AO, "Uncorrected Patrol Scrub Error", SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), - MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18) ), /* ignore OVER for UCNA */ @@ -174,6 +174,18 @@ static struct severity { USER ), MCESEV( + AR, "Data load error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( + AR, "Instruction fetch error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( PANIC, "Data load in unrecoverable area of kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL @@ -290,7 +302,6 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) switch (fixup_type) { case EX_TYPE_UACCESS: - case EX_TYPE_COPY: if (!copy_user) return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; @@ -386,7 +397,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char continue; if (s->excp && excp != s->excp) continue; - if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm) continue; if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) continue; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5f0414452b67..815fa67356a2 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -21,7 +21,7 @@ #include <linux/uio.h> #include <linux/mm.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -577,8 +577,7 @@ static bool is_blacklisted(unsigned int cpu) * This behavior is documented in item BDF90, #334165 (Intel Xeon * Processor E7-8800/4800 v4 Product Family). 
*/ - if (c->x86 == 6 && - c->x86_model == INTEL_FAM6_BROADWELL_X && + if (c->x86_vfm == INTEL_BROADWELL_X && c->x86_stepping == 0x01 && llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 303fef824167..e0fd57a8ba84 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -45,70 +45,70 @@ bool hyperv_paravisor_present __ro_after_init; EXPORT_SYMBOL_GPL(hyperv_paravisor_present); #if IS_ENABLED(CONFIG_HYPERV) -static inline unsigned int hv_get_nested_reg(unsigned int reg) +static inline unsigned int hv_get_nested_msr(unsigned int reg) { - if (hv_is_sint_reg(reg)) - return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0; + if (hv_is_sint_msr(reg)) + return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0; switch (reg) { - case HV_REGISTER_SIMP: - return HV_REGISTER_NESTED_SIMP; - case HV_REGISTER_SIEFP: - return HV_REGISTER_NESTED_SIEFP; - case HV_REGISTER_SVERSION: - return HV_REGISTER_NESTED_SVERSION; - case HV_REGISTER_SCONTROL: - return HV_REGISTER_NESTED_SCONTROL; - case HV_REGISTER_EOM: - return HV_REGISTER_NESTED_EOM; + case HV_X64_MSR_SIMP: + return HV_X64_MSR_NESTED_SIMP; + case HV_X64_MSR_SIEFP: + return HV_X64_MSR_NESTED_SIEFP; + case HV_X64_MSR_SVERSION: + return HV_X64_MSR_NESTED_SVERSION; + case HV_X64_MSR_SCONTROL: + return HV_X64_MSR_NESTED_SCONTROL; + case HV_X64_MSR_EOM: + return HV_X64_MSR_NESTED_EOM; default: return reg; } } -u64 hv_get_non_nested_register(unsigned int reg) +u64 hv_get_non_nested_msr(unsigned int reg) { u64 value; - if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) + if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else rdmsrl(reg, value); return value; } -EXPORT_SYMBOL_GPL(hv_get_non_nested_register); +EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); -void hv_set_non_nested_register(unsigned int reg, u64 value) +void hv_set_non_nested_msr(unsigned int reg, u64 value) { - if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { + if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { hv_ivm_msr_write(reg, value); /* Write proxy bit via wrmsl instruction */ - if (hv_is_sint_reg(reg)) + if (hv_is_sint_msr(reg)) wrmsrl(reg, value | 1 << 20); } else { wrmsrl(reg, value); } } -EXPORT_SYMBOL_GPL(hv_set_non_nested_register); +EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); -u64 hv_get_register(unsigned int reg) +u64 hv_get_msr(unsigned int reg) { if (hv_nested) - reg = hv_get_nested_reg(reg); + reg = hv_get_nested_msr(reg); - return hv_get_non_nested_register(reg); + return hv_get_non_nested_msr(reg); } -EXPORT_SYMBOL_GPL(hv_get_register); +EXPORT_SYMBOL_GPL(hv_get_msr); -void hv_set_register(unsigned int reg, u64 value) +void hv_set_msr(unsigned int reg, u64 value) { if (hv_nested) - reg = hv_get_nested_reg(reg); + reg = hv_get_nested_msr(reg); - hv_set_non_nested_register(reg, value); + hv_set_non_nested_msr(reg, value); } -EXPORT_SYMBOL_GPL(hv_set_register); +EXPORT_SYMBOL_GPL(hv_set_msr); static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); @@ -352,13 +352,24 @@ static void __init reduced_hw_init(void) x86_init.irqs.pre_vector_init = x86_init_noop; } +int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) +{ + unsigned int hv_max_functions; + + hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); + if (hv_max_functions < HYPERV_CPUID_VERSION) { + pr_err("%s: Could not detect Hyper-V version\n", __func__); + return -ENODEV; + } + + 
cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx); + + return 0; +} + static void __init ms_hyperv_init_platform(void) { int hv_max_functions_eax; - int hv_host_info_eax; - int hv_host_info_ebx; - int hv_host_info_ecx; - int hv_host_info_edx; #ifdef CONFIG_PARAVIRT pv_info.name = "Hyper-V"; @@ -409,21 +420,6 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: running on a nested hypervisor\n"); } - /* - * Extract host information. - */ - if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) { - hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); - hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); - hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); - hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); - - pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", - hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF, - hv_host_info_eax, hv_host_info_edx & 0xFFFFFF, - hv_host_info_ecx, hv_host_info_edx >> 24); - } - if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; @@ -456,7 +452,7 @@ static void __init ms_hyperv_init_platform(void) /* To be supported: more work is required. */ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; - /* HV_REGISTER_CRASH_CTL is unsupported. */ + /* HV_MSR_CRASH_CTL is unsupported. */ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; /* Don't trust Hyper-V's TLB-flushing hypercalls. */ @@ -648,6 +644,7 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init.x2apic_available = ms_hyperv_x2apic_available, .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, + .init.guest_late_init = ms_hyperv_late_init, #ifdef CONFIG_AMD_MEM_ENCRYPT .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 422a4ddc2ab7..7b29ebda024f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -108,7 +108,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; rdmsr(MSR_AMD64_SYSCFG, lo, hi); diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 83e40341583e..19b4fdb94a36 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include <linux/cacheinfo.h> #include <linux/cpuhotplug.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/resctrl.h> #include "internal.h" @@ -821,18 +821,18 @@ static __init bool get_rdt_mon_resources(void) static __init void __check_quirks_intel(void) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_HASWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_HASWELL_X: if (!rdt_options[RDT_FLAG_L3_CAT].force_off) cache_alloc_hsw_probe(); break; - case INTEL_FAM6_SKYLAKE_X: + case INTEL_SKYLAKE_X: if (boot_cpu_data.x86_stepping <= 4) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); fallthrough; - case INTEL_FAM6_BROADWELL_X: + case INTEL_BROADWELL_X: intel_rdt_mbm_apply_quirk(); break; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c99f26ebe7a6..1a8687f8073a 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ 
b/arch/x86/kernel/cpu/resctrl/internal.h @@ -78,7 +78,8 @@ cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) else cpu = cpumask_any_but(mask, exclude_cpu); - if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) + /* Only continue if tick_nohz_full_mask has been initialized. */ + if (!tick_nohz_full_enabled()) return cpu; /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 884b88e25141..04584a76ceb4 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -23,7 +23,7 @@ #include <linux/uaccess.h> #include <asm/cacheflush.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/resctrl.h> #include <asm/perf_event.h> @@ -88,8 +88,8 @@ static u64 get_prefetch_disable_bits(void) boot_cpu_data.x86 != 6) return 0; - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_BROADWELL_X: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -100,8 +100,8 @@ static u64 get_prefetch_disable_bits(void) * 63:4 Reserved */ return 0xF; - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -1084,9 +1084,9 @@ static int measure_l2_residency(void *_plr) * L2_HIT 02H * L2_MISS 10H */ - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + switch (boot_cpu_data.x86_vfm) { + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: perf_miss_attr.config = X86_CONFIG(.event = 0xd1, .umask = 0x10); perf_hit_attr.config = X86_CONFIG(.event = 0xd1, @@ -1123,8 +1123,8 @@ static int measure_l3_residency(void *_plr) * MISS 41H */ - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_BROADWELL_X: /* On BDW the hit event counts references, not hits */ perf_hit_attr.config = X86_CONFIG(.event = 0x2e, .umask = 0x4f); @@ -1142,7 +1142,7 @@ static int measure_l3_residency(void *_plr) */ counts.miss_after -= counts.miss_before; - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) { /* * On BDW references and misses are counted, need to adjust. 
* Sometimes the "hits" counter is a bit more than the diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 0dad49a09b7a..af5aa2c754c2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, @@ -49,6 +50,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 3259b1d4fefe..d17c9b71eb4a 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -123,7 +123,6 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; #endif - set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } @@ -210,7 +209,11 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) topo_info.nr_disabled_cpus++; } - /* Register present and possible CPUs in the domain maps */ + /* + * Register present and possible CPUs in the domain + * maps. cpu_possible_map will be updated in + * topology_init_possible_cpus() after enumeration is done. + */ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); } @@ -415,6 +418,17 @@ void __init topology_init_possible_cpus(void) unsigned int total = assigned + disabled; u32 apicid, firstid; + /* + * If there was no APIC registered, then fake one so that the + * topology bitmap is populated. That ensures that the code below + * is valid and the various query interfaces can be used + * unconditionally. This does not affect the actual APIC code in + * any way because either the local APIC address has not been + * registered or the local APIC was disabled on the command line. + */ + if (topo_info.boot_cpu_apic_id == BAD_APICID) + topology_register_boot_apic(0); + if (!restrict_to_up()) { if (WARN_ON_ONCE(assigned > nr_cpu_ids)) { disabled += assigned - nr_cpu_ids; diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 1a8b3ad493af..d419deed6a48 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -29,11 +29,21 @@ static bool parse_8000_0008(struct topo_scan *tscan) if (!sft) sft = get_count_order(ecx.cpu_nthreads + 1); - topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1); + /* + * cpu_nthreads describes the number of threads in the package + * sft is the number of APIC ID bits per package + * + * As the number of actual threads per core is not described in + * this leaf, just set the CORE domain shift and let the later + * parsers set SMT shift. Assume one thread per core by default + * which is correct if there are no other CPUID leafs to parse. 
+ */ + topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1); return true; } -static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id) +static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) { /* * Starting with Fam 17h the DIE domain could probably be used to @@ -48,7 +58,7 @@ static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_ tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) +static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) { struct { // eax @@ -73,12 +83,14 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* - * If leaf 0xb is available, then SMT shift is set already. If not - * take it from ecx.threads_per_core and use topo_update_dom() - - * topology_set_dom() would propagate and overwrite the already - * propagated CORE level. + * If leaf 0xb is available, then the domain shifts are set + * already and nothing to do here. */ - if (!has_0xb) { + if (!has_topoext) { + /* + * Leaf 0x80000008 set the CORE domain shift already. + * Update the SMT domain, but do not propagate it. + */ unsigned int nthreads = leaf.core_nthreads + 1; topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); @@ -107,64 +119,86 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) return true; } -static bool parse_fam10h_node_id(struct topo_scan *tscan) +static void parse_fam10h_node_id(struct topo_scan *tscan) { - struct { - union { + union { + struct { u64 node_id : 3, nodes_per_pkg : 3, unused : 58; - u64 msr; }; + u64 msr; } nid; if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) - return false; + return; rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); tscan->c->topo.llc_id = nid.node_id; - return true; } static void legacy_set_llc(struct topo_scan *tscan) { unsigned int apicid = tscan->c->topo.initial_apicid; - /* parse_8000_0008() set everything up except llc_id */ - tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; + /* If none of the parsers set LLC ID then use the die ID for it. */ + if (tscan->c->topo.llc_id == BAD_APICID) + tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; +} + +static void topoext_fixup(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u64 msrval; + + /* Try to re-enable TopologyExtensions if switched off by BIOS */ + if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) + return; + + if (msr_set_bit(0xc0011005, 54) <= 0) + return; + + rdmsrl(0xc0011005, msrval); + if (msrval & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } } static void parse_topology_amd(struct topo_scan *tscan) { - bool has_0xb = false; + bool has_topoext = false; /* * If the extended topology leaf 0x8000_001e is available - * try to get SMT and CORE shift from leaf 0xb first, then - * try to get the CORE shift from leaf 0x8000_0008. + * try to get SMT, CORE, TILE, and DIE shifts from extended + * CPUID leaf 0x8000_0026 on supported processors first. If + * extended CPUID leaf 0x8000_0026 is not supported, try to + * get SMT and CORE shift from leaf 0xb first, then try to + * get the CORE shift from leaf 0x8000_0008. 
*/ if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) - has_0xb = cpu_parse_topology_ext(tscan); + has_topoext = cpu_parse_topology_ext(tscan); - if (!has_0xb && !parse_8000_0008(tscan)) + if (!has_topoext && !parse_8000_0008(tscan)) return; /* Prefer leaf 0x8000001e if available */ - if (parse_8000_001e(tscan, has_0xb)) + if (parse_8000_001e(tscan, has_topoext)) return; /* Try the NODEID MSR */ - if (parse_fam10h_node_id(tscan)) - return; - - legacy_set_llc(tscan); + parse_fam10h_node_id(tscan); } void cpu_parse_topology_amd(struct topo_scan *tscan) { tscan->amd_nodes_per_pkg = 1; + topoext_fixup(tscan); parse_topology_amd(tscan); + legacy_set_llc(tscan); if (tscan->amd_nodes_per_pkg > 1) set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM); diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index a50ae8d63d1c..9a6069e7133c 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -140,7 +140,7 @@ static void parse_topology(struct topo_scan *tscan, bool early) } } -static void topo_set_ids(struct topo_scan *tscan) +static void topo_set_ids(struct topo_scan *tscan, bool early) { struct cpuinfo_x86 *c = tscan->c; u32 apicid = c->topo.apicid; @@ -148,8 +148,10 @@ static void topo_set_ids(struct topo_scan *tscan) c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN); c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN); - c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); - c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + } /* Package relative core ID */ c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >> @@ -187,7 +189,7 @@ void cpu_parse_topology(struct cpuinfo_x86 *c) tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]); } - topo_set_ids(&tscan); + topo_set_ids(&tscan, false); } void __init cpu_init_topology(struct cpuinfo_x86 *c) @@ -208,7 +210,7 @@ void __init cpu_init_topology(struct cpuinfo_x86 *c) x86_topo_system.dom_size[dom] = 1U << sft; } - topo_set_ids(&tscan); + topo_set_ids(&tscan, true); /* * AMD systems have Nodes per package which cannot be mapped to diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c index e477228cd5b2..467b0326bf1a 100644 --- a/arch/x86/kernel/cpu/topology_ext.c +++ b/arch/x86/kernel/cpu/topology_ext.c @@ -13,7 +13,10 @@ enum topo_types { CORE_TYPE = 2, MAX_TYPE_0B = 3, MODULE_TYPE = 3, + AMD_CCD_TYPE = 3, TILE_TYPE = 4, + AMD_SOCKET_TYPE = 4, + MAX_TYPE_80000026 = 5, DIE_TYPE = 5, DIEGRP_TYPE = 6, MAX_TYPE_1F = 7, @@ -32,6 +35,13 @@ static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = { [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN, }; +static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = { + [SMT_TYPE] = TOPO_SMT_DOMAIN, + [CORE_TYPE] = TOPO_CORE_DOMAIN, + [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN, + [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN, +}; + static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, unsigned int *last_dom) { @@ -56,6 +66,7 @@ static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, switch (leaf) { case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break; case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break; + case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break; default: return 
false; } @@ -125,6 +136,10 @@ bool cpu_parse_topology_ext(struct topo_scan *tscan) if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f)) return true; + /* AMD: Try leaf 0x80000026 first. */ + if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026)) + return true; + /* Intel/AMD: Fall back to leaf 0xB if available */ return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 44a91ef5a23b..a7d562697e50 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -405,8 +405,8 @@ static void __die_header(const char *str, struct pt_regs *regs, long err) pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - pr, + "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, + ++die_counter, pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b66f540de054..68b09f718f10 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -532,9 +532,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, + enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); + return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -806,7 +807,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = memblock_phys_alloc(size, align); if (addr) { - e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } @@ -1016,17 +1017,6 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - /* - * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by - * kexec and do not need to be reserved. 
- */ - if (data->type != SETUP_EFI && - data->type != SETUP_IMA && - data->type != SETUP_RNG_SEED) - e820__range_update_kexec(pa_data, - sizeof(*data) + data->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT) { len += data->len; early_memunmap(data, sizeof(*data)); @@ -1038,12 +1028,9 @@ void __init e820__reserve_setup_data(void) indirect = (struct setup_indirect *)data->data; - if (indirect->type != SETUP_INDIRECT) { + if (indirect->type != SETUP_INDIRECT) e820__range_update(indirect->addr, indirect->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } } pa_data = pa_next; @@ -1051,7 +1038,6 @@ void __init e820__reserve_setup_data(void) } e820__update_table(e820_table); - e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index e963344b0449..53935b4d62e3 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -2,6 +2,7 @@ /* * EISA specific code */ +#include <linux/cc_platform.h> #include <linux/ioport.h> #include <linux/eisa.h> #include <linux/io.h> @@ -12,7 +13,7 @@ static __init int eisa_bus_probe(void) { void __iomem *p; - if (xen_pv_domain() && !xen_initial_domain()) + if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return 0; p = ioremap(0x0FFFD9, 4); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d978251496b8..6276329f5e66 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -178,10 +178,11 @@ void fpu__init_cpu_xstate(void) * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures - * that any stale state is wiped out from XFD. + * that any stale state is wiped out from XFD. Reset the per CPU + * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) - wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. 
XCR0) sets user features diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 3518fb26d06b..19ca623ffa2a 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -148,20 +148,26 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs #endif #ifdef CONFIG_X86_64 +static inline void xfd_set_state(u64 xfd) +{ + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); +} + static inline void xfd_update_state(struct fpstate *fpstate) { if (fpu_state_size_dynamic()) { u64 xfd = fpstate->xfd; - if (__this_cpu_read(xfd_state) != xfd) { - wrmsrl(MSR_IA32_XFD, xfd); - __this_cpu_write(xfd_state, xfd); - } + if (__this_cpu_read(xfd_state) != xfd) + xfd_set_state(xfd); } } extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else +static inline void xfd_set_state(u64 xfd) { } + static inline void xfd_update_state(struct fpstate *fpstate) { } static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 212e8e06aeba..a817ed0724d1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -81,6 +81,13 @@ static inline bool check_la57_support(void) if (!(native_read_cr4() & X86_CR4_LA57)) return false; + RIP_REL_REF(__pgtable_l5_enabled) = 1; + RIP_REL_REF(pgdir_shift) = 48; + RIP_REL_REF(ptrs_per_p4d) = 512; + RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; + RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; + RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; + return true; } @@ -175,7 +182,7 @@ unsigned long __head __startup_64(unsigned long physaddr, p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); p4d[MAX_PTRS_PER_P4D - 1] += load_delta; - pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE_NOENC; + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; } RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; @@ -431,15 +438,6 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); - if (check_la57_support()) { - __pgtable_l5_enabled = 1; - pgdir_shift = 48; - ptrs_per_p4d = 512; - page_offset_base = __PAGE_OFFSET_BASE_L5; - vmalloc_base = __VMALLOC_BASE_L5; - vmemmap_base = __VMEMMAP_BASE_L5; - } - cr4_init_shadow(); /* Kill off the identity-map trampoline */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b50f3641c4d6..2e42056d2306 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -44,9 +44,6 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id - -#define SIZEOF_PTREGS 17*4 - /* * Worst-case size of the kernel mapping we need to make: * a relocatable kernel can live anywhere in lowmem, so we need to be able @@ -488,19 +485,13 @@ SYM_DATA_END(initial_page_table) .data .balign 4 -/* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder - * reliably detect the end of the stack. 
- */ -SYM_DATA(initial_stack, - .long init_thread_union + THREAD_SIZE - - SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) +SYM_DATA(initial_stack, .long __top_init_kernel_stack) __INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" -#include "../../x86/xen/xen-head.S" +#include "../xen/xen-head.S" /* * The IDT and GDT 'descriptors' are a strange 48-bit object diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d8198fbd70e5..330922b328bf 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,7 +66,7 @@ SYM_CODE_START_NOALIGN(startup_64) mov %rsi, %r15 /* Set up the stack for verify_cpu() */ - leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp + leaq __top_init_kernel_stack(%rip), %rsp /* Setup GSBASE to allow stack canary access for C code */ movl $MSR_GS_BASE, %ecx @@ -720,7 +720,7 @@ SYM_DATA(smpboot_control, .long 0) SYM_DATA(phys_base, .quad 0x0) EXPORT_SYMBOL(phys_base) -#include "../../x86/xen/xen-head.S" +#include "../xen/xen-head.S" __PAGE_ALIGNED_BSS SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 091b3ab76a18..d0e49bd7c6f3 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -373,7 +373,16 @@ out: kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - if (is_endbr(*(u32 *)addr)) { + u32 insn; + + /* + * Since 'addr' is not guaranteed to be safe to access, use + * copy_from_kernel_nofault() to read the instruction: + */ + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) + return NULL; + + if (is_endbr(insn)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4cadfd606e8e..263f8aed4e2c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -44,7 +44,7 @@ #include <asm/svm.h> #include <asm/e820/api.h> -DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); +DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled); static int kvmapf = 1; @@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; @@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { flags = __this_cpu_read(apf_reason.flags); __this_cpu_write(apf_reason.flags, 0); } @@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) inc_irq_stat(irq_hv_callback_count); - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); @@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); - __this_cpu_write(apf_reason.enabled, 1); + __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -383,11 +384,11 @@ static void kvm_guest_cpu_init(void) static void kvm_pv_disable_apf(void) { - if (!__this_cpu_read(apf_reason.enabled)) + if (!__this_cpu_read(async_pf_enabled)) 
return; wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __this_cpu_write(apf_reason.enabled, 0); + __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 1ccd30c8246f..e89171b0347a 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -197,12 +197,12 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (!smp_check_mpc(mpc, oem, str)) return 0; - /* Initialize the lapic mapping */ - if (!acpi_lapic) - register_lapic_address(mpc->lapic); - - if (early) + if (early) { + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); return 1; + } /* Now process the configuration blocks. */ while (count < mpc->length) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9a5b372c706f..ed163c8c8604 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -580,7 +580,7 @@ EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); static char *nmi_check_stall_msg[] = { /* */ -/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */ +/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */ /* | +------ cpu_is_offline(cpu) */ /* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */ /* | | | NMI handler has been invoked. */ @@ -628,22 +628,26 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) nmi_seq = READ_ONCE(nsp->idt_nmi_seq); if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) { msgp = "CPU entered NMI handler function, but has not exited"; - } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) { - msgp = "CPU is handling NMIs"; - } else { - idx = ((nsp->idt_seq_snap & 0x1) << 2) | + } else if (nsp->idt_nmi_seq_snap == nmi_seq || + nsp->idt_nmi_seq_snap + 1 == nmi_seq) { + idx = ((nmi_seq & 0x1) << 2) | (cpu_is_offline(cpu) << 1) | (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls)); msgp = nmi_check_stall_msg[idx]; if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) modp = ", but OK because ignore_nmis was set"; - if (nmi_seq & 0x1) - msghp = " (CPU currently in NMI handler function)"; - else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) + if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) msghp = " (CPU exited one NMI handler function)"; + else if (nmi_seq & 0x1) + msghp = " (CPU currently in NMI handler function)"; + else + msghp = " (CPU was never in an NMI handler function)"; + } else { + msgp = "CPU is handling NMIs"; } - pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n", - __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies)); + pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp); + pr_alert("%s: last activity: %lu jiffies ago.\n", + __func__, j - READ_ONCE(nsp->recv_jiffies)); } } diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 319fef37d9dc..cc2c34ba7228 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -203,16 +203,6 @@ void __init probe_roms(void) unsigned char c; int i; - /* - * The ROM memory range is not part of the e820 table and is therefore not - * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted - * memory, and SNP requires encrypted memory to be validated before access. - * Do that here. 
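The nmi.c stall-check hunk above builds a 3-bit index out of independent conditions and uses it to select an entry from nmi_check_stall_msg[]. A minimal sketch of that bit-packed table lookup, with a placeholder message table rather than the kernel's strings:

#include <stdio.h>
#include <stdbool.h>

/*
 * Three independent conditions are packed into a 3-bit table index, as in
 * nmi_backtrace_stall_check(). The table below is a stand-in only.
 */
static const char *stall_msg[8] = {
	"bits: 000", "bits: 001", "bits: 010", "bits: 011",
	"bits: 100", "bits: 101", "bits: 110", "bits: 111",
};

static const char *pick_msg(bool in_nmi, bool cpu_offline, bool handler_invoked)
{
	unsigned int idx = ((unsigned int)in_nmi << 2) |
			   ((unsigned int)cpu_offline << 1) |
			    (unsigned int)handler_invoked;

	return stall_msg[idx];
}

int main(void)
{
	printf("%s\n", pick_msg(true, false, true));	/* "bits: 101" */
	return 0;
}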
- */ - snp_prep_memory(video_rom_resource.start, - ((system_rom_resource.end + 1) - video_rom_resource.start), - SNP_PAGE_STATE_PRIVATE); - /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7062b84dd467..6d3d20e3e43a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -139,7 +139,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, log_lvl, d3, d6, d7); } - if (cpu_feature_enabled(X86_FEATURE_OSPKE)) + if (cr4 & X86_CR4_PKE) printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 2e7066980f3e..51a849a79c98 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -10,7 +10,6 @@ #include <asm/vsyscall.h> #include <asm/x86_init.h> #include <asm/time.h> -#include <asm/intel-mid.h> #include <asm/setup.h> #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3e1e96efadfe..55a1fc332e20 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -7,9 +7,9 @@ */ #include <linux/acpi.h> #include <linux/console.h> +#include <linux/cpu.h> #include <linux/crash_dump.h> #include <linux/dma-map-ops.h> -#include <linux/dmi.h> #include <linux/efi.h> #include <linux/ima.h> #include <linux/init_ohci1394_dma.h> @@ -36,6 +36,7 @@ #include <asm/bios_ebda.h> #include <asm/bugs.h> #include <asm/cacheinfo.h> +#include <asm/coco.h> #include <asm/cpu.h> #include <asm/efi.h> #include <asm/gart.h> @@ -753,6 +754,22 @@ void __init setup_arch(char **cmdline_p) boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + /* * If we have OLPC OFW, we might end up relocating the fixmap due to * reserve_top(), so do this before touching the ioremap area. @@ -832,22 +849,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -#else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } -#endif -#endif - - strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug @@ -902,7 +903,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); reserve_ibft_region(); - dmi_setup(); + x86_init.resources.dmi_setup(); /* * VMware detection requires dmi to be available, so this @@ -992,6 +993,7 @@ void __init setup_arch(char **cmdline_p) * memory size. 
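The setup.c hunk above moves the CONFIG_CMDLINE handling earlier in setup_arch() and, unless CONFIG_CMDLINE_OVERRIDE is set, appends the boot loader command line to the built-in one using bounded string operations. A small user-space approximation, using snprintf()/strncpy() in place of the kernel's strlcat()/strscpy(); the COMMAND_LINE_SIZE value here is made up:

#include <stdio.h>
#include <string.h>

#define COMMAND_LINE_SIZE 256	/* illustrative; the real value is arch-defined */

/*
 * Model of the CONFIG_CMDLINE_BOOL (without CONFIG_CMDLINE_OVERRIDE) path:
 * append the boot loader command line to the built-in one, then use the
 * combined string as the effective command line. Bounded copies only.
 */
static void merge_cmdline(char *boot_command_line, size_t size,
			  const char *builtin_cmdline)
{
	char merged[COMMAND_LINE_SIZE];

	if (!builtin_cmdline[0])
		return;		/* nothing built in: keep boot loader string */

	snprintf(merged, sizeof(merged), "%s %s",
		 builtin_cmdline, boot_command_line);
	strncpy(boot_command_line, merged, size - 1);
	boot_command_line[size - 1] = '\0';
}

int main(void)
{
	char bootargs[COMMAND_LINE_SIZE] = "root=/dev/sda1 quiet";

	merge_cmdline(bootargs, sizeof(bootargs), "console=ttyS0");
	printf("%s\n", bootargs);	/* console=ttyS0 root=/dev/sda1 quiet */
	return 0;
}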
*/ mem_encrypt_setup_arch(); + cc_random_init(); efi_fake_memmap(); efi_find_mirror(); @@ -1206,16 +1208,6 @@ void __init i386_reserve_resources(void) #endif /* CONFIG_X86_32 */ -#ifndef CONFIG_SMP -void __init smp_prepare_boot_cpu(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - *c = boot_cpu_data; - c->initialized = true; -} -#endif - static struct notifier_block kernel_offset_notifier = { .notifier_call = dump_kernel_offset }; @@ -1227,3 +1219,10 @@ static int __init register_kernel_offset_dumper(void) return 0; } __initcall(register_kernel_offset_dumper); + +#ifdef CONFIG_HOTPLUG_CPU +bool arch_cpu_is_hotpluggable(int cpu) +{ + return cpu > 0; +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 8b04958da5e7..b4f8fa0f722c 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -1203,12 +1203,14 @@ static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, break; case SVM_EXIT_MONITOR: - if (opcode == 0x010f && modrm == 0xc8) + /* MONITOR and MONITORX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa)) return ES_OK; break; case SVM_EXIT_MWAIT: - if (opcode == 0x010f && modrm == 0xc9) + /* MWAIT and MWAITX instructions generate the same error code */ + if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb)) return ES_OK; break; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index b59b09c2f284..995f94467101 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -23,6 +23,7 @@ #include <linux/platform_device.h> #include <linux/io.h> #include <linux/psp-sev.h> +#include <linux/dmi.h> #include <uapi/linux/sev-guest.h> #include <asm/init.h> @@ -795,21 +796,6 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); } -void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) -{ - unsigned long vaddr, npages; - - vaddr = (unsigned long)__va(paddr); - npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; - - if (op == SNP_PAGE_STATE_PRIVATE) - early_snp_set_memory_private(vaddr, paddr, npages); - else if (op == SNP_PAGE_STATE_SHARED) - early_snp_set_memory_shared(vaddr, paddr, npages); - else - WARN(1, "invalid memory op %d\n", op); -} - static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, unsigned long vaddr_end, int op) { @@ -952,7 +938,7 @@ static int snp_set_vmsa(void *va, bool vmsa) #define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) #define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) -static void *snp_alloc_vmsa_page(void) +static void *snp_alloc_vmsa_page(int cpu) { struct page *p; @@ -964,7 +950,7 @@ static void *snp_alloc_vmsa_page(void) * * Allocate an 8k page which is also 8k-aligned. */ - p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + p = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); if (!p) return NULL; @@ -1033,7 +1019,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) * #VMEXIT of that vCPU would wipe out all of the settings being done * here. 
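The vc_check_opcode_bytes() hunk above accepts MONITORX/MWAITX alongside MONITOR/MWAIT because each pair raises the same SVM exit code; the difference is only in the ModRM byte. A standalone sketch of that opcode/ModRM check, using the byte values shown in the hunk:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * MONITOR is 0F 01 /C8 and MONITORX is 0F 01 /FA; MWAIT is 0F 01 /C9 and
 * MWAITX is 0F 01 /FB. Each pair generates the same #VC error code, so the
 * check accepts either ModRM value for a given exit code.
 */
static bool is_monitor_like(uint16_t opcode, uint8_t modrm)
{
	return opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa);
}

static bool is_mwait_like(uint16_t opcode, uint8_t modrm)
{
	return opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb);
}

int main(void)
{
	printf("monitorx accepted: %d\n", is_monitor_like(0x010f, 0xfa)); /* 1 */
	printf("mwait accepted:    %d\n", is_mwait_like(0x010f, 0xc9));   /* 1 */
	return 0;
}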
*/ - vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(cpu); if (!vmsa) return -ENOMEM; @@ -1355,7 +1341,7 @@ static void __init alloc_runtime_data(int cpu) { struct sev_es_runtime_data *data; - data = memblock_alloc(sizeof(*data), PAGE_SIZE); + data = memblock_alloc_node(sizeof(*data), PAGE_SIZE, cpu_to_node(cpu)); if (!data) panic("Can't allocate SEV-ES runtime data"); @@ -2136,6 +2122,17 @@ void __head __noreturn snp_abort(void) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); } +/* + * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are + * enabled, as the alternative (fallback) logic for DMI probing in the legacy + * ROM region can cause a crash since this region is not pre-validated. + */ +void __init snp_dmi_setup(void) +{ + if (efi_enabled(EFI_CONFIG_TABLES)) + dmi_setup(); +} + static void dump_cpuid_table(void) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -2287,16 +2284,6 @@ static int __init snp_init_platform_device(void) } device_initcall(snp_init_platform_device); -void kdump_sev_callback(void) -{ - /* - * Do wbinvd() on remote CPUs when SNP is enabled in order to - * safely do SNP_SHUTDOWN on the local CPU. - */ - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) - wbinvd(); -} - void sev_show_status(void) { int i; diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index c12624bc82a3..ef654530bf5a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -34,7 +34,7 @@ #include <asm/gsseg.h> #ifdef CONFIG_IA32_EMULATION -#include <asm/ia32_unistd.h> +#include <asm/unistd_32_ia32.h> static inline void reload_segments(struct sigcontext_32 *sc) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe355c89f6c1..0c35207320cb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -313,14 +313,6 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -static void __init smp_store_boot_cpu_info(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - *c = boot_cpu_data; - c->initialized = true; -} - /* * The bootstrap kernel entry code has set these up. 
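The sev.c hunks above convert the per-CPU VMSA page and runtime-data allocations to node-local variants (alloc_pages_node(), memblock_alloc_node()) so each CPU's data lives on its own NUMA node. A rough user-space analogue of the same idea, assuming libnuma is available (build with -lnuma); the libnuma calls are only an analogy, not what the kernel does:

#include <numa.h>
#include <stdio.h>

int main(void)
{
	void *buf;
	int node;

	if (numa_available() < 0) {
		fprintf(stderr, "NUMA not supported on this system\n");
		return 1;
	}

	node = numa_node_of_cpu(0);	/* node that CPU 0 belongs to */
	if (node < 0)
		node = 0;

	/* Backing memory comes from the requested node, like alloc_pages_node(). */
	buf = numa_alloc_onnode(4096, node);
	if (!buf) {
		fprintf(stderr, "numa_alloc_onnode failed\n");
		return 1;
	}

	printf("allocated 4096 bytes on node %d\n", node);
	numa_free(buf, 4096);
	return 0;
}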
Save them for * a given CPU @@ -446,9 +438,9 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static const struct x86_cpu_id intel_cod_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ - X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ + X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */ + X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */ + X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */ {} }; @@ -1039,36 +1031,24 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_die_cpumask(0)); } -static void __init smp_cpu_index_default(void) -{ - int i; - struct cpuinfo_x86 *c; - - for_each_possible_cpu(i) { - c = &cpu_data(i); - /* mark all to hotplug */ - c->cpu_index = nr_cpu_ids; - } -} - void __init smp_prepare_cpus_common(void) { - unsigned int i; + unsigned int cpu, node; - smp_cpu_index_default(); + /* Mark all except the boot CPU as hotpluggable */ + for_each_possible_cpu(cpu) { + if (cpu) + per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids; + } - /* - * Setup boot CPU information - */ - smp_store_boot_cpu_info(); /* Final full version of the data */ - mb(); + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); - for_each_possible_cpu(i) { - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node); } set_cpu_sibling_map(0); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c deleted file mode 100644 index d42c28b8bfd8..000000000000 --- a/arch/x86/kernel/topology.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Populate sysfs with topology information - * - * Written by: Matthew Dobson, IBM Corporation - * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to <colpatch@us.ibm.com> - */ -#include <linux/interrupt.h> -#include <linux/nodemask.h> -#include <linux/export.h> -#include <linux/mmzone.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/irq.h> -#include <asm/io_apic.h> -#include <asm/cpu.h> - -#ifdef CONFIG_HOTPLUG_CPU -bool arch_cpu_is_hotpluggable(int cpu) -{ - return cpu > 0; -} -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5a69a49acc96..06b170759e5b 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -26,7 +26,7 @@ #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/i8259.h> #include <asm/uv/uv.h> @@ -44,7 +44,7 @@ EXPORT_SYMBOL(tsc_khz); static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; -static DEFINE_STATIC_KEY_FALSE(__use_tsc); +static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; @@ -682,7 +682,7 @@ unsigned long native_calibrate_tsc(void) * clock. */ if (crystal_khz == 0 && - boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) + boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* @@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void) * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ - if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 6555a857a1e6..deeb02825670 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -147,13 +147,13 @@ static const struct freq_desc freq_desc_lgm = { }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm), + X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw), + X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 56451fd2099e..3509afc6a672 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -15,11 +15,7 @@ * put it inside the section definition. */ -#ifdef CONFIG_X86_32 -#define LOAD_OFFSET __PAGE_OFFSET -#else #define LOAD_OFFSET __START_KERNEL_map -#endif #define RUNTIME_DISCARD_EXIT #define EMITS_PT_NOTE @@ -114,11 +110,10 @@ PHDRS { SECTIONS { + . = __START_KERNEL; #ifdef CONFIG_X86_32 - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET); #else - . 
= __START_KERNEL; phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET); #endif @@ -172,6 +167,9 @@ SECTIONS /* init_task */ INIT_TASK_DATA(THREAD_SIZE) + /* equivalent to task_pt_regs(&init_task) */ + __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE; + #ifdef CONFIG_X86_32 /* 32 bit has nosave before _edata */ NOSAVE_DATA diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a42830dc151b..d5dc5a92635a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -3,6 +3,7 @@ * * For licencing details see kernel-base/COPYING */ +#include <linux/dmi.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/export.h> @@ -66,6 +67,7 @@ struct x86_init_ops x86_init __initdata = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = e820__memory_setup_default, + .dmi_setup = dmi_setup, }, .mpparse = { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8c3032a96caf..0ebdd088f28b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -118,10 +118,11 @@ config KVM_AMD will be called kvm-amd. config KVM_AMD_SEV - def_bool y bool "AMD Secure Encrypted Virtualization (SEV) support" + default y depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) + select ARCH_HAS_CC_PLATFORM help Provides support for launching Encrypted VMs (SEV) and Encrypted VMs with Encrypted State (SEV-ES) on AMD processors. diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 475b5fa917a6..addc44fc7187 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,10 +3,6 @@ ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror -ifeq ($(CONFIG_FRAME_POINTER),y) -OBJECT_FILES_NON_STANDARD_vmenter.o := y -endif - include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index adba49afb5fe..77352a4abd87 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -189,15 +189,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 return 0; } -static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, - const char *sig) +static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries, + int nent, const char *sig) { struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; u32 base; for_each_possible_hypervisor_cpuid_base(base) { - entry = kvm_find_cpuid_entry(vcpu, base); + entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (entry) { u32 signature[3]; @@ -217,22 +217,29 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp return cpuid; } -static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, - struct kvm_cpuid_entry2 *entries, int nent) +static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, + const char *sig) { - u32 base = vcpu->arch.kvm_cpuid.base; - - if (!base) - return NULL; + return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, sig); +} - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries, + int nent, u32 kvm_cpuid_base) +{ + return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES, KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static 
struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { - return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent); + u32 base = vcpu->arch.kvm_cpuid.base; + + if (!base) + return NULL; + + return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent, base); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -266,6 +273,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e int nent) { struct kvm_cpuid_entry2 *best; + struct kvm_hypervisor_cpuid kvm_cpuid; best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { @@ -292,10 +300,12 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); - if (kvm_hlt_in_guest(vcpu->kvm) && best && - (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) - best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE); + if (kvm_cpuid.base) { + best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base); + if (kvm_hlt_in_guest(vcpu->kvm) && best) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + } if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); @@ -366,6 +376,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_update_pv_runtime(vcpu); + vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 856e3037e74f..23dbb9eb277c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -120,6 +120,16 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } +static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.is_amd_compatible; +} + +static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) +{ + return !guest_cpuid_is_amd_compatible(vcpu); +} + static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf37586f0466..ebf41023be38 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2776,7 +2776,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); - if (r && lvt_type == APIC_LVTPC) + if (r && lvt_type == APIC_LVTPC && + guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 992e651540e8..db007a4dffa2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4935,7 +4935,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->cpu_role.base.level, is_efer_nx(context), guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, @@ -5576,9 +5576,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu 
*vcpu) * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ - vcpu->arch.root_mmu.root_role.word = 0; - vcpu->arch.guest_mmu.root_role.word = 0; - vcpu->arch.nested_mmu.root_role.word = 0; + vcpu->arch.root_mmu.root_role.invalid = 1; + vcpu->arch.guest_mmu.root_role.invalid = 1; + vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; @@ -7399,7 +7399,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ - if (gfn >= slot->base_gfn) { + if (gfn >= slot->base_gfn && + gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d078157e62aa..04c1f0957fea 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1548,17 +1548,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } -/* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. - */ +static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) +{ + /* + * All TDP MMU shadow pages share the same role as their root, aside + * from level, so it is valid to key off any shadow page to determine if + * write protection is needed for an entire tree. + */ + return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); +} + static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { - u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; + const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; bool spte_set = false; @@ -1573,7 +1577,7 @@ retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (!(iter.old_spte & dbit)) @@ -1590,11 +1594,9 @@ retry: } /* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. + * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the + * memslot. Returns true if an SPTE has been changed and the TLBs need to be + * flushed. */ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -1610,18 +1612,11 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, return spte_set; } -/* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. 
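The tdp_mmu.c hunks above decide what "clear dirty" means for an SPTE: clear the D-bit when A/D bits are in use, otherwise clear the writable bit, keyed off the new tdp_mmu_need_write_protect() helper. A toy model of that selection, with made-up bit positions rather than the real x86 SPTE layout:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative bit positions, not the real SPTE encoding. */
#define SPTE_WRITABLE	(1ULL << 1)
#define SPTE_DIRTY	(1ULL << 9)

static uint64_t clear_dirty(uint64_t spte, bool need_write_protect)
{
	/* Pick the bit that tracks dirtiness for this configuration. */
	uint64_t dbit = need_write_protect ? SPTE_WRITABLE : SPTE_DIRTY;

	if (!(spte & dbit))
		return spte;	/* nothing to do, no TLB flush needed */

	return spte & ~dbit;
}

int main(void)
{
	uint64_t spte = SPTE_WRITABLE | SPTE_DIRTY;

	printf("A/D in use:      %#llx\n",
	       (unsigned long long)clear_dirty(spte, false)); /* dirty bit gone */
	printf("write-protected: %#llx\n",
	       (unsigned long long)clear_dirty(spte, true));  /* W bit gone     */
	return 0;
}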
If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. - */ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); @@ -1633,7 +1628,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (iter.level > PG_LEVEL_4K || @@ -1659,11 +1654,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for + * which a bit is set in mask, starting at gfn. The given memslot is expected to + * contain all the GFNs represented by set bits in the mask. */ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c397b28e3d1b..a593b03c9aed 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -775,8 +775,20 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->pebs_data_cfg_mask = ~0ull; bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - if (vcpu->kvm->arch.enable_pmu) - static_call(kvm_x86_pmu_refresh)(vcpu); + if (!vcpu->kvm->arch.enable_pmu) + return; + + static_call(kvm_x86_pmu_refresh)(vcpu); + + /* + * At RESET, both Intel and AMD CPUs set all enable bits for general + * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that + * was written for v1 PMUs don't unknowingly leave GP counters disabled + * in the global controls). Emulate that behavior when refreshing the + * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. + */ + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index aadefcaa9561..2f4e155080ba 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -52,7 +52,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1) #define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2) #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) -#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) +#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) /* CPUID level 0x80000007 (EDX). 
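The kvm_pmu_refresh() hunk above emulates the RESET value of IA32_PERF_GLOBAL_CTRL by setting the enable bit for every general-purpose counter, i.e. GENMASK_ULL(nr_arch_gp_counters - 1, 0). A minimal sketch of computing that low-N-bits mask, guarding the shift edge cases:

#include <stdio.h>
#include <stdint.h>

/*
 * Mask with the low nr_gp_counters bits set, mirroring the GENMASK_ULL()
 * use in the hunk; zero counters (also checked there) yields an empty mask.
 */
static uint64_t gp_counter_enable_mask(unsigned int nr_gp_counters)
{
	if (nr_gp_counters == 0)
		return 0;
	if (nr_gp_counters >= 64)
		return ~0ULL;	/* avoid undefined behaviour of a 64-bit shift */

	return (1ULL << nr_gp_counters) - 1;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)gp_counter_enable_mask(8)); /* 0xff */
	printf("%#llx\n", (unsigned long long)gp_counter_enable_mask(4)); /* 0xf  */
	return 0;
}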
*/ @@ -102,10 +102,12 @@ static const struct cpuid_reg reverse_cpuid[] = { */ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) { + BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS); BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_5); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); } @@ -126,6 +128,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); + KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); default: return x86_feature; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ae0ac12382b9..759581bb2128 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -84,9 +84,10 @@ struct enc_region { }; /* Called with the sev_bitmap_lock held, or on shutdown */ -static int sev_flush_asids(int min_asid, int max_asid) +static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) { - int ret, asid, error = 0; + int ret, error = 0; + unsigned int asid; /* Check if there are any ASIDs to reclaim before performing a flush */ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); @@ -116,7 +117,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm) } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(int min_asid, int max_asid) +static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { if (sev_flush_asids(min_asid, max_asid)) return false; @@ -143,8 +144,20 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - int asid, min_asid, max_asid, ret; + /* + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. + * Note: min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + */ + unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; + unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; + unsigned int asid; bool retry = true; + int ret; + + if (min_asid > max_asid) + return -ENOTTY; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); @@ -157,12 +170,6 @@ static int sev_asid_new(struct kvm_sev_info *sev) mutex_lock(&sev_bitmap_lock); - /* - * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. - * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - */ - min_asid = sev->es_active ? 1 : min_sev_asid; - max_asid = sev->es_active ? 
min_sev_asid - 1 : max_sev_asid; again: asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); if (asid > max_asid) { @@ -179,7 +186,8 @@ again: mutex_unlock(&sev_bitmap_lock); - return asid; + sev->asid = asid; + return 0; e_uncharge: sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); @@ -187,7 +195,7 @@ e_uncharge: return ret; } -static int sev_get_asid(struct kvm *kvm) +static unsigned int sev_get_asid(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -247,21 +255,19 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_platform_init_args init_args = {0}; - int asid, ret; + int ret; if (kvm->created_vcpus) return -EINVAL; - ret = -EBUSY; if (unlikely(sev->active)) - return ret; + return -EINVAL; sev->active = true; sev->es_active = argp->id == KVM_SEV_ES_INIT; - asid = sev_asid_new(sev); - if (asid < 0) + ret = sev_asid_new(sev); + if (ret) goto e_no_asid; - sev->asid = asid; init_args.probe = false; ret = sev_platform_init(&init_args); @@ -287,8 +293,8 @@ e_no_asid: static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { + unsigned int asid = sev_get_asid(kvm); struct sev_data_activate activate; - int asid = sev_get_asid(kvm); int ret; /* activate ASID on the given handle */ @@ -428,7 +434,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); @@ -2240,8 +2246,10 @@ void __init sev_hardware_setup(void) goto out; } - sev_asid_count = max_sev_asid - min_sev_asid + 1; - WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + if (min_sev_asid <= max_sev_asid) { + sev_asid_count = max_sev_asid - min_sev_asid + 1; + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + } sev_supported = true; /* SEV-ES support requested? */ @@ -2272,7 +2280,9 @@ void __init sev_hardware_setup(void) out: if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", - sev_supported ? "enabled" : "disabled", + sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : + "unusable" : + "disabled", min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", @@ -2320,7 +2330,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) */ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { - int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + unsigned int asid = sev_get_asid(vcpu->kvm); /* * Note! 
The address must be a kernel address, as regular page walk @@ -2638,7 +2648,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) void pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - int asid = sev_get_asid(svm->vcpu.kvm); + unsigned int asid = sev_get_asid(svm->vcpu.kvm); /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -3174,7 +3184,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) unsigned long pfn; struct page *p; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d1a9f9951635..9aaf83c8d57d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1503,6 +1503,11 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) +{ + return page_address(sd->save_area) + 0x400; +} + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1519,12 +1524,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * or subsequent vmload of host save area. */ vmsave(sd->save_area_pa); - if (sev_es_guest(vcpu->kvm)) { - struct sev_es_save_area *hostsa; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - sev_es_prepare_switch_to_guest(hostsa); - } + if (sev_es_guest(vcpu->kvm)) + sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd)); if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -4101,6 +4102,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); guest_state_enter_irqoff(); @@ -4108,7 +4110,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) - __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, + sev_es_host_save_area(sd)); else __svm_vcpu_run(svm, spec_ctrl_intercepted); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7f1fbd874c45..33878efdebc8 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -698,7 +698,8 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); /* vmenter.S */ -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, + struct sev_es_save_area *hostsa); void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 187018c424bf..a0c8eb37d3e1 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/asm-offsets.h> #include <asm/bitsperlong.h> +#include <asm/frame.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> #include "kvm-asm-offsets.h" @@ -67,7 +68,7 @@ "", X86_FEATURE_V_SPEC_CTRL 901: .endm -.macro RESTORE_HOST_SPEC_CTRL_BODY +.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req 900: /* Same for after vmexit. 
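The sev_asid_new() hunk above derives the legal ASID range from es_active (1 .. min_sev_asid - 1 for SEV-ES, min_sev_asid .. max_sev_asid for plain SEV) and fails up front when the range is empty. A self-contained sketch of that range-restricted allocation; NR_ASIDS and the boolean array below stand in for the kernel's sev_asid_bitmap:

#include <stdio.h>
#include <stdbool.h>

#define NR_ASIDS 32

static bool asid_used[NR_ASIDS + 1];	/* index 0 unused, ASIDs are 1-based */

static int asid_new(bool es_active, unsigned int min_sev_asid,
		    unsigned int max_sev_asid)
{
	unsigned int min = es_active ? 1 : min_sev_asid;
	unsigned int max = es_active ? min_sev_asid - 1 : max_sev_asid;
	unsigned int asid;

	if (min > max || max > NR_ASIDS)
		return -1;	/* range empty (e.g. SEV ASIDs disallowed) */

	for (asid = min; asid <= max; asid++) {
		if (!asid_used[asid]) {
			asid_used[asid] = true;
			return (int)asid;
		}
	}
	return -1;		/* all ASIDs in the range are busy */
}

int main(void)
{
	/* Pretend ASIDs 1..4 are reserved for SEV-ES, 5..32 for plain SEV. */
	printf("SEV-ES asid: %d\n", asid_new(true, 5, NR_ASIDS));  /* 1 */
	printf("SEV asid:    %d\n", asid_new(false, 5, NR_ASIDS)); /* 5 */
	return 0;
}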
*/ mov $MSR_IA32_SPEC_CTRL, %ecx @@ -76,7 +77,7 @@ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, * if it was not intercepted during guest execution. */ - cmpb $0, (%_ASM_SP) + cmpb $0, \spec_ctrl_intercepted jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) @@ -99,6 +100,7 @@ */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP + mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 push %r15 push %r14 @@ -268,7 +270,7 @@ SYM_FUNC_START(__svm_vcpu_run) RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) 10: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b @@ -290,66 +292,68 @@ SYM_FUNC_START(__svm_vcpu_run) SYM_FUNC_END(__svm_vcpu_run) +#ifdef CONFIG_KVM_AMD_SEV + + +#ifdef CONFIG_X86_64 +#define SEV_ES_GPRS_BASE 0x300 +#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE) +#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE) +#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE) +#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE) +#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE) +#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE) +#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE) +#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE) +#endif + /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) - push %_ASM_BP -#ifdef CONFIG_X86_64 - push %r15 - push %r14 - push %r13 - push %r12 -#else - push %edi - push %esi -#endif - push %_ASM_BX + FRAME_BEGIN /* - * Save variables needed after vmexit on the stack, in inverse - * order compared to when they are needed. + * Save non-volatile (callee-saved) registers to the host save area. + * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not + * saved on VMRUN. */ + mov %rbp, SEV_ES_RBP (%rdx) + mov %r15, SEV_ES_R15 (%rdx) + mov %r14, SEV_ES_R14 (%rdx) + mov %r13, SEV_ES_R13 (%rdx) + mov %r12, SEV_ES_R12 (%rdx) + mov %rbx, SEV_ES_RBX (%rdx) - /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ - push %_ASM_ARG2 - - /* Save @svm. */ - push %_ASM_ARG1 - -.ifnc _ASM_ARG1, _ASM_DI /* - * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX - * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. + * Save volatile registers that hold arguments that are needed after + * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted). */ - mov %_ASM_ARG1, %_ASM_DI -.endif + mov %rdi, SEV_ES_RDI (%rdx) + mov %rsi, SEV_ES_RSI (%rdx) - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (@hostsa). */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ - mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX - mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX + mov SVM_current_vmcb(%rdi), %rax + mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ sti -1: vmrun %_ASM_AX +1: vmrun %rax 2: cli - /* Pop @svm to RDI, guest registers have been saved already. */ - pop %_ASM_DI - #ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). 
*/ RESTORE_HOST_SPEC_CTRL /* @@ -361,30 +365,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM - /* "Pop" @spec_ctrl_intercepted. */ - pop %_ASM_BX - - pop %_ASM_BX - -#ifdef CONFIG_X86_64 - pop %r12 - pop %r13 - pop %r14 - pop %r15 -#else - pop %esi - pop %edi -#endif - pop %_ASM_BP + FRAME_END RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY %sil -3: cmpb $0, _ASM_RIP(kvm_rebooting) +3: cmpb $0, kvm_rebooting(%rip) jne 2b ud2 _ASM_EXTABLE(1b, 3b) SYM_FUNC_END(__svm_sev_es_vcpu_run) +#endif /* CONFIG_KVM_AMD_SEV */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 88659de4d2a7..c6b4b1728006 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -735,13 +735,13 @@ TRACE_EVENT(kvm_nested_intr_vmexit, * Tracepoint for nested #vmexit because of interrupt pending */ TRACE_EVENT(kvm_invlpga, - TP_PROTO(__u64 rip, int asid, u64 address), + TP_PROTO(__u64 rip, unsigned int asid, u64 address), TP_ARGS(rip, asid, address), TP_STRUCT__entry( - __field( __u64, rip ) - __field( int, asid ) - __field( __u64, address ) + __field( __u64, rip ) + __field( unsigned int, asid ) + __field( __u64, address ) ), TP_fast_assign( @@ -750,7 +750,7 @@ TRACE_EVENT(kvm_invlpga, __entry->address = address; ), - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", + TP_printk("rip: 0x%016llx asid: %u address: 0x%016llx", __entry->rip, __entry->asid, __entry->address) ); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 12ade343a17e..be40474de6e4 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -535,7 +535,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (cpuid_model_is_consistent(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) - x86_perf_get_lbr(&lbr_desc->records); + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else lbr_desc->records.nr = 0; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 2bfbf758d061..f6986dee6f8c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -275,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host + CLEAR_BRANCH_HISTORY_VMEXIT + /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c37a89eda90f..22411f4aff53 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -218,6 +218,8 @@ module_param(ple_window_max, uint, 0444); int __read_mostly pt_mode = PT_MODE_SYSTEM; module_param(pt_mode, int, S_IRUGO); +struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; + static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -7862,10 +7864,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } -static u64 vmx_get_perf_capabilities(void) +static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7875,15 +7876,43 @@ static u64 vmx_get_perf_capabilities(void) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { - x86_perf_get_lbr(&lbr); - if (lbr.nr) + x86_perf_get_lbr(&vmx_lbr_caps); + + /* + * KVM requires LBR callstack support, as the overhead due to + * context switching LBRs without said support is too high. 
+ * See intel_pmu_create_guest_lbr_event() for more info. + */ + if (!vmx_lbr_caps.has_callstack) + memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); + else if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; + + /* + * Disallow adaptive PEBS as it is functionally broken, can be + * used by the guest to read *host* LBRs, and can be used to + * bypass userspace event filters. To correctly and safely + * support adaptive PEBS, KVM needs to: + * + * 1. Account for the ADAPTIVE flag when (re)programming fixed + * counters. + * + * 2. Gain support from perf (or take direct control of counter + * programming) to support events without adaptive PEBS + * enabled for the hardware counter. + * + * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with + * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. + * + * 4. Document which PMU events are effectively exposed to the + * guest via adaptive PEBS, and make adaptive PEBS mutually + * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. + */ + perf_cap &= ~PERF_CAP_PEBS_BASELINE; } return perf_cap; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 65786dbe7d60..90f9e4434646 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -15,6 +15,7 @@ #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" +#include "../mmu.h" #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 @@ -109,6 +110,8 @@ struct lbr_desc { bool msr_passthrough; }; +extern struct x86_pmu_lbr vmx_lbr_caps; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -719,7 +722,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) if (!enable_ept) return true; - return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + return allow_smaller_maxphyaddr && + cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits(); } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b7778..91478b769af0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1621,7 +1621,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) static u64 kvm_get_arch_capabilities(void) { @@ -3470,7 +3470,7 @@ static bool is_mci_status_msr(u32 msr) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? 
*/ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 721b528da9ac..391059b2c6fb 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -163,6 +163,7 @@ SYM_CODE_START_NOALIGN(srso_alias_untrain_ret) lfence jmp srso_alias_return_thunk SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) .popsection .pushsection .text..__x86.rethunk_safe @@ -224,10 +225,16 @@ SYM_CODE_START(srso_return_thunk) SYM_CODE_END(srso_return_thunk) #define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret" -#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret" #else /* !CONFIG_MITIGATION_SRSO */ +/* Dummy for the alternative in CALL_UNTRAIN_RET. */ +SYM_CODE_START(srso_alias_untrain_ret) + ANNOTATE_UNRET_SAFE + ANNOTATE_NOENDBR + ret + int3 +SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) #define JMP_SRSO_UNTRAIN_RET "ud2" -#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2" #endif /* CONFIG_MITIGATION_SRSO */ #ifdef CONFIG_MITIGATION_UNRET_ENTRY @@ -319,9 +326,7 @@ SYM_FUNC_END(retbleed_untrain_ret) #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) SYM_FUNC_START(entry_untrain_ret) - ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \ - JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO, \ - JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS + ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) @@ -377,8 +382,15 @@ SYM_FUNC_END(call_depth_return_thunk) SYM_CODE_START(__x86_return_thunk) UNWIND_HINT_FUNC ANNOTATE_NOENDBR +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || \ + defined(CONFIG_MITIGATION_SRSO) || \ + defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ALTERNATIVE __stringify(ANNOTATE_UNRET_SAFE; ret), \ "jmp warn_thunk_thunk", X86_FEATURE_ALWAYS +#else + ANNOTATE_UNRET_SAFE + ret +#endif int3 SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c index 1b118fd93140..39423ec409e1 100644 --- a/arch/x86/math-emu/fpu_etc.c +++ b/arch/x86/math-emu/fpu_etc.c @@ -120,9 +120,14 @@ static void fxam(FPU_REG *st0_ptr, u_char st0tag) setcc(c); } +static void FPU_ST0_illegal(FPU_REG *st0_ptr, u_char st0_tag) +{ + FPU_illegal(); +} + static FUNC_ST0 const fp_etc_table[] = { - fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal, - ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal + fchs, fabs, FPU_ST0_illegal, FPU_ST0_illegal, + ftst_, fxam, FPU_ST0_illegal, FPU_ST0_illegal, }; void FPU_etc(void) diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 990d847ae902..85daf98c81c3 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -433,13 +433,13 @@ static void fxtract(FPU_REG *st0_ptr, u_char st0_tag) #endif /* PARANOID */ } -static void fdecstp(void) +static void fdecstp(FPU_REG *st0_ptr, u_char st0_tag) { clear_C1(); top--; } -static void fincstp(void) +static void fincstp(FPU_REG *st0_ptr, u_char st0_tag) { clear_C1(); top++; @@ -1631,7 +1631,7 @@ static void fscale(FPU_REG *st0_ptr, u_char st0_tag) static FUNC_ST0 const trig_table_a[] = { f2xm1, fyl2x, fptan, fpatan, - fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp + fxtract, fprem1, fdecstp, fincstp, }; void FPU_triga(void) diff --git a/arch/x86/math-emu/reg_constant.c 
b/arch/x86/math-emu/reg_constant.c index 742619e94bdf..003a0b2753e6 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -108,8 +108,13 @@ static void fldz(int rc) typedef void (*FUNC_RC) (int); +static void FPU_RC_illegal(int unused) +{ + FPU_illegal(); +} + static FUNC_RC constants_table[] = { - fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal + fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, FPU_RC_illegal }; void fconst(void) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index b522933bfa56..51986e8a9d35 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -164,13 +164,6 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup, return ex_handler_default(fixup, regs); } -static bool ex_handler_copy(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) -{ - WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - return ex_handler_fault(fixup, regs, trapnr); -} - static bool ex_handler_msr(const struct exception_table_entry *fixup, struct pt_regs *regs, bool wrmsr, bool safe, int reg) { @@ -341,8 +334,6 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, return ex_handler_fault(e, regs, trapnr); case EX_TYPE_UACCESS: return ex_handler_uaccess(e, regs, trapnr, fault_addr); - case EX_TYPE_COPY: - return ex_handler_copy(e, regs, trapnr); case EX_TYPE_CLEAR_FS: return ex_handler_clear_fs(e, regs); case EX_TYPE_FPU_RESTORE: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 622d12ec7f08..bba4e020dd64 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -723,39 +723,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { - /* - * Any interrupt that takes a fault gets the fixup. This makes - * the below recursive fault logic only apply to a faults from - * task context. - */ - if (in_interrupt()) - return; - - /* - * Per the above we're !in_interrupt(), aka. task context. - * - * In this case we need to make sure we're not recursively - * faulting through the emulate_vsyscall() logic. - */ - if (current->thread.sig_on_uaccess_err && signal) { - sanitize_error_code(address, &error_code); - - set_signal_archinfo(address, error_code); - - if (si_code == SEGV_PKUERR) { - force_sig_pkuerr((void __user *)address, pkey); - } else { - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); - } - } - - /* - * Barring that, we can do the fixup and be happy. - */ + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) return; - } /* * AMD erratum #91 manifests as a spurious page fault on a PREFETCH diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index a204a332c71f..968d7005f4a7 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -26,31 +26,18 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, for (; addr < end; addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; - bool use_gbpage; next = (addr & PUD_MASK) + PUD_SIZE; if (next > end) next = end; - /* if this is already a gbpage, this portion is already mapped */ - if (pud_leaf(*pud)) - continue; - - /* Is using a gbpage allowed? 
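The math-emu hunks above replace casts such as (FUNC_ST0)FPU_illegal with small wrappers whose prototypes match the dispatch-table entry, since calling a function through a mismatched pointer type is undefined behaviour and trips control-flow-integrity checks. A minimal sketch of the same fix, with invented function names:

#include <stdio.h>

typedef void (*op_fn)(int reg, unsigned char tag);

static void op_illegal(void)
{
	puts("illegal operation");
}

/* Correctly typed wrapper; the arguments are simply ignored. */
static void op_illegal_wrapper(int reg, unsigned char tag)
{
	(void)reg;
	(void)tag;
	op_illegal();
}

static void op_neg(int reg, unsigned char tag)
{
	printf("negate st(%d), tag %u\n", reg, tag);
}

static const op_fn op_table[] = {
	op_neg,
	op_illegal_wrapper,	/* NOT: (op_fn)op_illegal */
};

int main(void)
{
	op_table[0](0, 1);
	op_table[1](0, 1);
	return 0;
}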
*/ - use_gbpage = info->direct_gbpages; - - /* Don't use gbpage if it maps more than the requested region. */ - /* at the begining: */ - use_gbpage &= ((addr & ~PUD_MASK) == 0); - /* ... or at the end: */ - use_gbpage &= ((next & ~PUD_MASK) == 0); - - /* Never overwrite existing mappings */ - use_gbpage &= !pud_present(*pud); - - if (use_gbpage) { + if (info->direct_gbpages) { pud_t pudval; + if (pud_present(*pud)) + continue; + + addr &= PUD_MASK; pudval = __pud((addr - info->offset) | info->page_flag); set_pud(pud, pudval); continue; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 679893ea5e68..6b43b6480354 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -261,21 +261,17 @@ static void __init probe_page_size_mask(void) } } -#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \ - .family = 6, \ - .model = _model, \ - } /* * INVLPG may not properly flush Global entries * on these CPUs when PCIDs are enabled. */ static const struct x86_cpu_id invlpg_miss_ids[] = { - INTEL_MATCH(INTEL_FAM6_ALDERLAKE ), - INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ), - INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ), - INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ), - INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P), - INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S), + X86_MATCH_VFM(INTEL_ALDERLAKE, 0), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0), + X86_MATCH_VFM(INTEL_RAPTORLAKE, 0), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0), + X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0), {} }; diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 6f3b3e028718..0a120d85d7bb 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -102,6 +102,13 @@ void __init mem_encrypt_setup_arch(void) phys_addr_t total_mem = memblock_phys_mem_size(); unsigned long size; + /* + * Do RMP table fixups after the e820 tables have been setup by + * e820__memory_setup(). + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + snp_fixup_e820_tables(); + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 70b91de2e053..422602f6039b 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -492,6 +492,24 @@ void __init sme_early_init(void) */ if (sev_status & MSR_AMD64_SEV_ENABLED) ia32_disable(); + + /* + * Override init functions that scan the ROM region in SEV-SNP guests, + * as this memory is not pre-validated and would thus cause a crash. + */ + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.pci.init_irq = x86_init_noop; + x86_init.resources.probe_roms = x86_init_noop; + + /* + * DMI setup behavior for SEV-SNP guests depends on + * efi_enabled(EFI_CONFIG_TABLES), which hasn't been + * parsed yet. snp_dmi_setup() will run after that + * parsing has happened. 
+ */ + x86_init.resources.dmi_setup = snp_dmi_setup; + } } void __init mem_encrypt_free_decrypted_mem(void) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 104544359d69..025fd7ea5d69 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -24,6 +24,7 @@ #include <linux/memblock.h> #include <linux/init.h> +#include <asm/pgtable_areas.h> #include "numa_internal.h" diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 0d72183b5dd0..36b603d0cdde 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -947,6 +947,38 @@ static void free_pfn_range(u64 paddr, unsigned long size) memtype_free(paddr, paddr + size); } +static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, + pgprot_t *pgprot) +{ + unsigned long prot; + + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT)); + + /* + * We need the starting PFN and cachemode used for track_pfn_remap() + * that covered the whole VMA. For most mappings, we can obtain that + * information from the page tables. For COW mappings, we might now + * suddenly have anon folios mapped and follow_phys() will fail. + * + * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to + * detect the PFN. If we need the cachemode as well, we're out of luck + * for now and have to fail fork(). + */ + if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { + if (pgprot) + *pgprot = __pgprot(prot); + return 0; + } + if (is_cow_mapping(vma->vm_flags)) { + if (pgprot) + return -EINVAL; + *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return 0; + } + WARN_ON_ONCE(1); + return -EINVAL; +} + /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). @@ -957,20 +989,13 @@ static void free_pfn_range(u64 paddr, unsigned long size) int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; - unsigned long prot; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; if (vma->vm_flags & VM_PAT) { - /* - * reserve the whole chunk covered by vma. We need the - * starting address and protection from pte. - */ - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { - WARN_ON_ONCE(1); + if (get_pat_info(vma, &paddr, &pgprot)) return -EINVAL; - } - pgprot = __pgprot(prot); + /* reserve the whole chunk covered by vma. 
*/ return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } @@ -1045,7 +1070,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked) { resource_size_t paddr; - unsigned long prot; if (vma && !(vma->vm_flags & VM_PAT)) return; @@ -1053,11 +1077,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, /* free the chunk starting from pfn or the whole chunk */ paddr = (resource_size_t)pfn << PAGE_SHIFT; if (!paddr && !size) { - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { - WARN_ON_ONCE(1); + if (get_pat_info(vma, &paddr, NULL)) return; - } - size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a7ba8e178645..59cbc94b6e69 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -480,7 +480,7 @@ static int emit_call(u8 **pprog, void *func, void *ip) static int emit_rsb_call(u8 **pprog, void *func, void *ip) { OPTIMIZER_HIDE_VAR(func); - x86_call_depth_emit_accounting(pprog, func); + ip += x86_call_depth_emit_accounting(pprog, func, ip); return emit_patch(pprog, func, ip, 0xE8); } @@ -1807,36 +1807,41 @@ populate_extable: if (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX) { /* Conservatively check that src_reg + insn->off is a kernel address: - * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE - * src_reg is used as scratch for src_reg += insn->off and restored - * after emit_ldx if necessary + * src_reg + insn->off > TASK_SIZE_MAX + PAGE_SIZE + * and + * src_reg + insn->off < VSYSCALL_ADDR */ - u64 limit = TASK_SIZE_MAX + PAGE_SIZE; + u64 limit = TASK_SIZE_MAX + PAGE_SIZE - VSYSCALL_ADDR; u8 *end_of_jmp; - /* At end of these emitted checks, insn->off will have been added - * to src_reg, so no need to do relative load with insn->off offset - */ - insn_off = 0; + /* movabsq r10, VSYSCALL_ADDR */ + emit_mov_imm64(&prog, BPF_REG_AX, (long)VSYSCALL_ADDR >> 32, + (u32)(long)VSYSCALL_ADDR); - /* movabsq r11, limit */ - EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG)); - EMIT((u32)limit, 4); - EMIT(limit >> 32, 4); + /* mov src_reg, r11 */ + EMIT_mov(AUX_REG, src_reg); if (insn->off) { - /* add src_reg, insn->off */ - maybe_emit_1mod(&prog, src_reg, true); - EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off); + /* add r11, insn->off */ + maybe_emit_1mod(&prog, AUX_REG, true); + EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off); } - /* cmp src_reg, r11 */ - maybe_emit_mod(&prog, src_reg, AUX_REG, true); - EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG)); + /* sub r11, r10 */ + maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true); + EMIT2(0x29, add_2reg(0xC0, AUX_REG, BPF_REG_AX)); + + /* movabsq r10, limit */ + emit_mov_imm64(&prog, BPF_REG_AX, (long)limit >> 32, + (u32)(long)limit); + + /* cmp r10, r11 */ + maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true); + EMIT2(0x39, add_2reg(0xC0, AUX_REG, BPF_REG_AX)); - /* if unsigned '>=', goto load */ - EMIT2(X86_JAE, 0); + /* if unsigned '>', goto load */ + EMIT2(X86_JA, 0); end_of_jmp = prog; /* xor dst_reg, dst_reg */ @@ -1862,18 +1867,6 @@ populate_extable: /* populate jmp_offset for JMP above */ start_of_ldx[-1] = prog - start_of_ldx; - if (insn->off && src_reg != dst_reg) { - /* sub src_reg, insn->off - * Restore src_reg after "add src_reg, insn->off" in prev - * if statement. But if src_reg == dst_reg, emit_ldx - * above already clobbered src_reg, so no need to restore. 
- * If add src_reg, insn->off was unnecessary, no need to - * restore either. - */ - maybe_emit_1mod(&prog, src_reg, true); - EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off); - } - if (!bpf_prog->aux->extable) break; @@ -1972,20 +1965,17 @@ populate_extable: /* call */ case BPF_JMP | BPF_CALL: { - int offs; + u8 *ip = image + addrs[i - 1]; func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); - if (!imm32) - return -EINVAL; - offs = 7 + x86_call_depth_emit_accounting(&prog, func); - } else { - if (!imm32) - return -EINVAL; - offs = x86_call_depth_emit_accounting(&prog, func); + ip += 7; } - if (emit_call(&prog, func, image + addrs[i - 1] + offs)) + if (!imm32) + return -EINVAL; + ip += x86_call_depth_emit_accounting(&prog, func, ip); + if (emit_call(&prog, func, ip)) return -EINVAL; break; } @@ -2835,7 +2825,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * Direct-call fentry stub, as such it needs accounting for the * __fentry__ call. */ - x86_call_depth_emit_accounting(&prog, NULL); + x86_call_depth_emit_accounting(&prog, NULL, image); } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ @@ -3476,3 +3466,9 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +/* x86-64 JIT emits its own code to filter user addresses so return 0 here */ +u64 bpf_arch_uaddress_limit(void) +{ + return 0; +} diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 87313701f069..f5dbd25651e0 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -35,12 +35,6 @@ struct sim_dev_reg { struct sim_reg sim_reg; }; -struct sim_reg_op { - void (*init)(struct sim_dev_reg *reg); - void (*read)(struct sim_dev_reg *reg, u32 value); - void (*write)(struct sim_dev_reg *reg, u32 value); -}; - #define MB (1024 * 1024) #define KB (1024) #define SIZE_TO_MASK(size) (~(size - 1)) diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 944e0290f2c0..8c2d4b8de25d 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -75,6 +75,9 @@ static void __init init_pvh_bootparams(bool xen_guest) } else xen_raw_printk("Warning: Can fit ISA range into e820\n"); + if (xen_guest) + xen_reserve_extra_memory(&pvh_bootparams); + pvh_bootparams.hdr.cmd_line_ptr = pvh_start_info.cmdline_paddr; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index bc31863c5ee6..a18591f6e6d9 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -42,7 +42,8 @@ KCOV_INSTRUMENT := n # make up the standalone purgatory.ro PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel -PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -g0 +PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0 +PURGATORY_CFLAGS += -fpic -fvisibility=hidden PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING PURGATORY_CFLAGS += -fno-stack-protector diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b029fb81ebee..c101bed61940 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -11,41 +11,42 @@ #define Elf_Shdr ElfW(Shdr) #define Elf_Sym ElfW(Sym) -static Elf_Ehdr ehdr; -static unsigned long shnum; -static unsigned int shstrndx; -static unsigned int shsymtabndx; -static unsigned int shxsymtabndx; +static Elf_Ehdr ehdr; +static unsigned long shnum; +static unsigned int shstrndx; +static unsigned int shsymtabndx; +static unsigned int 
shxsymtabndx; static int sym_index(Elf_Sym *sym); struct relocs { - uint32_t *offset; - unsigned long count; - unsigned long size; + uint32_t *offset; + unsigned long count; + unsigned long size; }; -static struct relocs relocs16; -static struct relocs relocs32; +static struct relocs relocs16; +static struct relocs relocs32; + #if ELF_BITS == 64 -static struct relocs relocs32neg; -static struct relocs relocs64; -#define FMT PRIu64 +static struct relocs relocs32neg; +static struct relocs relocs64; +# define FMT PRIu64 #else -#define FMT PRIu32 +# define FMT PRIu32 #endif struct section { - Elf_Shdr shdr; - struct section *link; - Elf_Sym *symtab; - Elf32_Word *xsymtab; - Elf_Rel *reltab; - char *strtab; + Elf_Shdr shdr; + struct section *link; + Elf_Sym *symtab; + Elf32_Word *xsymtab; + Elf_Rel *reltab; + char *strtab; }; -static struct section *secs; +static struct section *secs; -static const char * const sym_regex_kernel[S_NSYMTYPES] = { +static const char * const sym_regex_kernel[S_NSYMTYPES] = { /* * Following symbols have been audited. There values are constant and do * not change if bzImage is loaded at a different physical address than @@ -115,13 +116,13 @@ static const char * const sym_regex_realmode[S_NSYMTYPES] = { "^pa_", }; -static const char * const *sym_regex; +static const char * const *sym_regex; + +static regex_t sym_regex_c[S_NSYMTYPES]; -static regex_t sym_regex_c[S_NSYMTYPES]; static int is_reloc(enum symtype type, const char *sym_name) { - return sym_regex[type] && - !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); + return sym_regex[type] && !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); } static void regex_init(int use_real_mode) @@ -139,8 +140,7 @@ static void regex_init(int use_real_mode) if (!sym_regex[i]) continue; - err = regcomp(&sym_regex_c[i], sym_regex[i], - REG_EXTENDED|REG_NOSUB); + err = regcomp(&sym_regex_c[i], sym_regex[i], REG_EXTENDED|REG_NOSUB); if (err) { regerror(err, &sym_regex_c[i], errbuf, sizeof(errbuf)); @@ -163,9 +163,10 @@ static const char *sym_type(unsigned type) #undef SYM_TYPE }; const char *name = "unknown sym type name"; - if (type < ARRAY_SIZE(type_name)) { + + if (type < ARRAY_SIZE(type_name)) name = type_name[type]; - } + return name; } @@ -179,9 +180,10 @@ static const char *sym_bind(unsigned bind) #undef SYM_BIND }; const char *name = "unknown sym bind name"; - if (bind < ARRAY_SIZE(bind_name)) { + + if (bind < ARRAY_SIZE(bind_name)) name = bind_name[bind]; - } + return name; } @@ -196,9 +198,10 @@ static const char *sym_visibility(unsigned visibility) #undef SYM_VISIBILITY }; const char *name = "unknown sym visibility name"; - if (visibility < ARRAY_SIZE(visibility_name)) { + + if (visibility < ARRAY_SIZE(visibility_name)) name = visibility_name[visibility]; - } + return name; } @@ -244,9 +247,10 @@ static const char *rel_type(unsigned type) #undef REL_TYPE }; const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name) && type_name[type]) { + + if (type < ARRAY_SIZE(type_name) && type_name[type]) name = type_name[type]; - } + return name; } @@ -256,15 +260,14 @@ static const char *sec_name(unsigned shndx) const char *name; sec_strtab = secs[shstrndx].strtab; name = "<noname>"; - if (shndx < shnum) { + + if (shndx < shnum) name = sec_strtab + secs[shndx].shdr.sh_name; - } - else if (shndx == SHN_ABS) { + else if (shndx == SHN_ABS) name = "ABSOLUTE"; - } - else if (shndx == SHN_COMMON) { + else if (shndx == SHN_COMMON) name = "COMMON"; - } + return name; } @@ -272,18 +275,19 @@ static const char 
*sym_name(const char *sym_strtab, Elf_Sym *sym) { const char *name; name = "<noname>"; - if (sym->st_name) { + + if (sym->st_name) name = sym_strtab + sym->st_name; - } - else { + else name = sec_name(sym_index(sym)); - } + return name; } static Elf_Sym *sym_lookup(const char *symname) { int i; + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; long nsyms; @@ -309,14 +313,15 @@ static Elf_Sym *sym_lookup(const char *symname) } #if BYTE_ORDER == LITTLE_ENDIAN -#define le16_to_cpu(val) (val) -#define le32_to_cpu(val) (val) -#define le64_to_cpu(val) (val) +# define le16_to_cpu(val) (val) +# define le32_to_cpu(val) (val) +# define le64_to_cpu(val) (val) #endif + #if BYTE_ORDER == BIG_ENDIAN -#define le16_to_cpu(val) bswap_16(val) -#define le32_to_cpu(val) bswap_32(val) -#define le64_to_cpu(val) bswap_64(val) +# define le16_to_cpu(val) bswap_16(val) +# define le32_to_cpu(val) bswap_32(val) +# define le64_to_cpu(val) bswap_64(val) #endif static uint16_t elf16_to_cpu(uint16_t val) @@ -337,13 +342,13 @@ static uint64_t elf64_to_cpu(uint64_t val) { return le64_to_cpu(val); } -#define elf_addr_to_cpu(x) elf64_to_cpu(x) -#define elf_off_to_cpu(x) elf64_to_cpu(x) -#define elf_xword_to_cpu(x) elf64_to_cpu(x) +# define elf_addr_to_cpu(x) elf64_to_cpu(x) +# define elf_off_to_cpu(x) elf64_to_cpu(x) +# define elf_xword_to_cpu(x) elf64_to_cpu(x) #else -#define elf_addr_to_cpu(x) elf32_to_cpu(x) -#define elf_off_to_cpu(x) elf32_to_cpu(x) -#define elf_xword_to_cpu(x) elf32_to_cpu(x) +# define elf_addr_to_cpu(x) elf32_to_cpu(x) +# define elf_off_to_cpu(x) elf32_to_cpu(x) +# define elf_xword_to_cpu(x) elf32_to_cpu(x) #endif static int sym_index(Elf_Sym *sym) @@ -365,22 +370,17 @@ static int sym_index(Elf_Sym *sym) static void read_ehdr(FILE *fp) { - if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { - die("Cannot read ELF header: %s\n", - strerror(errno)); - } - if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { + if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) + die("Cannot read ELF header: %s\n", strerror(errno)); + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) die("No ELF magic\n"); - } - if (ehdr.e_ident[EI_CLASS] != ELF_CLASS) { + if (ehdr.e_ident[EI_CLASS] != ELF_CLASS) die("Not a %d bit executable\n", ELF_BITS); - } - if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { + if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) die("Not a LSB ELF executable\n"); - } - if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) die("Unknown ELF version\n"); - } + /* Convert the fields to native endian */ ehdr.e_type = elf_half_to_cpu(ehdr.e_type); ehdr.e_machine = elf_half_to_cpu(ehdr.e_machine); @@ -439,19 +439,18 @@ static void read_shdrs(FILE *fp) Elf_Shdr shdr; secs = calloc(shnum, sizeof(struct section)); - if (!secs) { - die("Unable to allocate %ld section headers\n", - shnum); - } - if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { - die("Seek to %" FMT " failed: %s\n", - ehdr.e_shoff, strerror(errno)); - } + if (!secs) + die("Unable to allocate %ld section headers\n", shnum); + + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno)); + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; + if (fread(&shdr, sizeof(shdr), 1, fp) != 1) - die("Cannot read ELF section headers %d/%ld: %s\n", - i, shnum, strerror(errno)); + die("Cannot read ELF section headers %d/%ld: %s\n", i, shnum, strerror(errno)); + sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name); sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type); sec->shdr.sh_flags = 
elf_xword_to_cpu(shdr.sh_flags); @@ -471,31 +470,28 @@ static void read_shdrs(FILE *fp) static void read_strtabs(FILE *fp) { int i; + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_STRTAB) { + + if (sec->shdr.sh_type != SHT_STRTAB) continue; - } + sec->strtab = malloc(sec->shdr.sh_size); - if (!sec->strtab) { - die("malloc of %" FMT " bytes for strtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %" FMT " failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } + if (!sec->strtab) + die("malloc of %" FMT " bytes for strtab failed\n", sec->shdr.sh_size); + + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno)); + + if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size) + die("Cannot read symbol table: %s\n", strerror(errno)); } } static void read_symtabs(FILE *fp) { - int i,j; + int i, j; for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; @@ -504,19 +500,15 @@ static void read_symtabs(FILE *fp) switch (sec->shdr.sh_type) { case SHT_SYMTAB_SHNDX: sec->xsymtab = malloc(sec->shdr.sh_size); - if (!sec->xsymtab) { - die("malloc of %" FMT " bytes for xsymtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %" FMT " failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read extended symbol table: %s\n", - strerror(errno)); - } + if (!sec->xsymtab) + die("malloc of %" FMT " bytes for xsymtab failed\n", sec->shdr.sh_size); + + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno)); + + if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size) + die("Cannot read extended symbol table: %s\n", strerror(errno)); + shxsymtabndx = i; continue; @@ -524,19 +516,15 @@ static void read_symtabs(FILE *fp) num_syms = sec->shdr.sh_size / sizeof(Elf_Sym); sec->symtab = malloc(sec->shdr.sh_size); - if (!sec->symtab) { - die("malloc of %" FMT " bytes for symtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %" FMT " failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } + if (!sec->symtab) + die("malloc of %" FMT " bytes for symtab failed\n", sec->shdr.sh_size); + + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno)); + + if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size) + die("Cannot read symbol table: %s\n", strerror(errno)); + for (j = 0; j < num_syms; j++) { Elf_Sym *sym = &sec->symtab[j]; @@ -557,28 +545,27 @@ static void read_symtabs(FILE *fp) static void read_relocs(FILE *fp) { - int i,j; + int i, j; + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_REL_TYPE) { + + if (sec->shdr.sh_type != SHT_REL_TYPE) continue; - } + sec->reltab = malloc(sec->shdr.sh_size); - if (!sec->reltab) { - die("malloc of %" FMT " bytes for relocs failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, 
sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %" FMT " failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } + if (!sec->reltab) + die("malloc of %" FMT " bytes for relocs failed\n", sec->shdr.sh_size); + + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) + die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno)); + + if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size) + die("Cannot read symbol table: %s\n", strerror(errno)); + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) { Elf_Rel *rel = &sec->reltab[j]; + rel->r_offset = elf_addr_to_cpu(rel->r_offset); rel->r_info = elf_xword_to_cpu(rel->r_info); #if (SHT_REL_TYPE == SHT_RELA) @@ -601,23 +588,27 @@ static void print_absolute_symbols(void) printf("Absolute symbols\n"); printf(" Num: Value Size Type Bind Visibility Name\n"); + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; char *sym_strtab; int j; - if (sec->shdr.sh_type != SHT_SYMTAB) { + if (sec->shdr.sh_type != SHT_SYMTAB) continue; - } + sym_strtab = sec->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) { Elf_Sym *sym; const char *name; + sym = &sec->symtab[j]; name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { + + if (sym->st_shndx != SHN_ABS) continue; - } + printf(format, j, sym->st_value, sym->st_size, sym_type(ELF_ST_TYPE(sym->st_info)), @@ -645,34 +636,37 @@ static void print_absolute_relocs(void) char *sym_strtab; Elf_Sym *sh_symtab; int j; - if (sec->shdr.sh_type != SHT_REL_TYPE) { + + if (sec->shdr.sh_type != SHT_REL_TYPE) continue; - } + sec_symtab = sec->link; sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) continue; - } + /* * Do not perform relocations in .notes section; any * values there are meant for pre-boot consumption (e.g. * startup_xen). */ - if (sec_applies->shdr.sh_type == SHT_NOTE) { + if (sec_applies->shdr.sh_type == SHT_NOTE) continue; - } + sh_symtab = sec_symtab->symtab; sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) { Elf_Rel *rel; Elf_Sym *sym; const char *name; + rel = &sec->reltab[j]; sym = &sh_symtab[ELF_R_SYM(rel->r_info)]; name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { + + if (sym->st_shndx != SHN_ABS) continue; - } /* Absolute symbols are not relocated if bzImage is * loaded at a non-compiled address. 
Display a warning @@ -691,10 +685,8 @@ static void print_absolute_relocs(void) continue; if (!printed) { - printf("WARNING: Absolute relocations" - " present\n"); - printf("Offset Info Type Sym.Value " - "Sym.Name\n"); + printf("WARNING: Absolute relocations present\n"); + printf("Offset Info Type Sym.Value Sym.Name\n"); printed = 1; } @@ -718,8 +710,8 @@ static void add_reloc(struct relocs *r, uint32_t offset) void *mem = realloc(r->offset, newsize * sizeof(r->offset[0])); if (!mem) - die("realloc of %ld entries for relocs failed\n", - newsize); + die("realloc of %ld entries for relocs failed\n", newsize); + r->offset = mem; r->size = newsize; } @@ -730,6 +722,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)) { int i; + /* Walk through the relocations */ for (i = 0; i < shnum; i++) { char *sym_strtab; @@ -738,16 +731,25 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, int j; struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_REL_TYPE) { + if (sec->shdr.sh_type != SHT_REL_TYPE) continue; - } + sec_symtab = sec->link; sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) continue; - } + + /* + * Do not perform relocations in .notes sections; any + * values there are meant for pre-boot consumption (e.g. + * startup_xen). + */ + if (sec_applies->shdr.sh_type == SHT_NOTE) + continue; + sh_symtab = sec_symtab->symtab; sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) { Elf_Rel *rel = &sec->reltab[j]; Elf_Sym *sym = &sh_symtab[ELF_R_SYM(rel->r_info)]; @@ -781,14 +783,16 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, * kernel data and does not require special treatment. 
* */ -static int per_cpu_shndx = -1; +static int per_cpu_shndx = -1; static Elf_Addr per_cpu_load_addr; static void percpu_init(void) { int i; + for (i = 0; i < shnum; i++) { ElfW(Sym) *sym; + if (strcmp(sec_name(i), ".data..percpu")) continue; @@ -801,6 +805,7 @@ static void percpu_init(void) per_cpu_shndx = i; per_cpu_load_addr = sym->st_value; + return; } } @@ -871,8 +876,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, * Only used by jump labels */ if (is_percpu_sym(sym, symname)) - die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", - symname); + die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname); break; case R_X86_64_32: @@ -892,8 +896,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, if (is_reloc(S_ABS, symname)) break; - die("Invalid absolute %s relocation: %s\n", - rel_type(r_type), symname); + die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname); break; } @@ -913,8 +916,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; default: - die("Unsupported relocation type: %s (%d)\n", - rel_type(r_type), r_type); + die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type); break; } @@ -951,8 +953,7 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, if (is_reloc(S_ABS, symname)) break; - die("Invalid absolute %s relocation: %s\n", - rel_type(r_type), symname); + die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname); break; } @@ -960,16 +961,14 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, break; default: - die("Unsupported relocation type: %s (%d)\n", - rel_type(r_type), r_type); + die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type); break; } return 0; } -static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, - const char *symname) +static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname) { unsigned r_type = ELF32_R_TYPE(rel->r_info); int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); @@ -1004,9 +1003,7 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, if (!is_reloc(S_LIN, symname)) break; } - die("Invalid %s %s relocation: %s\n", - shn_abs ? "absolute" : "relative", - rel_type(r_type), symname); + die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname); break; case R_386_32: @@ -1027,14 +1024,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, add_reloc(&relocs32, rel->r_offset); break; } - die("Invalid %s %s relocation: %s\n", - shn_abs ? "absolute" : "relative", - rel_type(r_type), symname); + die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname); break; default: - die("Unsupported relocation type: %s (%d)\n", - rel_type(r_type), r_type); + die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type); break; } @@ -1046,7 +1040,10 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, static int cmp_relocs(const void *va, const void *vb) { const uint32_t *a, *b; - a = va; b = vb; + + a = va; + b = vb; + return (*a == *b)? 0 : (*a > *b)? 1 : -1; } @@ -1060,6 +1057,7 @@ static int write32(uint32_t v, FILE *f) unsigned char buf[4]; put_unaligned_le32(v, buf); + return fwrite(buf, 1, 4, f) == 4 ? 
0 : -1; } @@ -1072,8 +1070,7 @@ static void emit_relocs(int as_text, int use_real_mode) { int i; int (*write_reloc)(uint32_t, FILE *) = write32; - int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, - const char *symname); + int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname); #if ELF_BITS == 64 if (!use_real_mode) @@ -1160,6 +1157,7 @@ static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, rel_type(ELF_R_TYPE(rel->r_info)), symname, sec_name(sym_index(sym))); + return 0; } @@ -1185,19 +1183,24 @@ void process(FILE *fp, int use_real_mode, int as_text, read_strtabs(fp); read_symtabs(fp); read_relocs(fp); + if (ELF_BITS == 64) percpu_init(); + if (show_absolute_syms) { print_absolute_symbols(); return; } + if (show_absolute_relocs) { print_absolute_relocs(); return; } + if (show_reloc_info) { print_reloc_info(); return; } + emit_relocs(as_text, use_real_mode); } diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile index 1e36502cd738..ea343fc392dc 100644 --- a/arch/x86/virt/Makefile +++ b/arch/x86/virt/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += vmx/ +obj-y += svm/ vmx/ diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index cffe1157a90a..0ae10535c699 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -77,7 +77,7 @@ static int __mfd_enable(unsigned int cpu) { u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; rdmsrl(MSR_AMD64_SYSCFG, val); @@ -98,7 +98,7 @@ static int __snp_enable(unsigned int cpu) { u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; rdmsrl(MSR_AMD64_SYSCFG, val); @@ -163,6 +163,42 @@ bool snp_probe_rmptable_info(void) return true; } +static void __init __snp_fixup_e820_tables(u64 pa) +{ + if (IS_ALIGNED(pa, PMD_SIZE)) + return; + + /* + * Handle cases where the RMP table placement by the BIOS is not + * 2M aligned and the kexec kernel could try to allocate + * from within that chunk which then causes a fatal RMP fault. + * + * The e820_table needs to be updated as it is converted to + * kernel memory resources and used by KEXEC_FILE_LOAD syscall + * to load kexec segments. + * + * The e820_table_firmware needs to be updated as it is exposed + * to sysfs and used by the KEXEC_LOAD syscall to load kexec + * segments. + * + * The e820_table_kexec needs to be updated as it passed to + * the kexec-ed kernel. 
+ */ + pa = ALIGN_DOWN(pa, PMD_SIZE); + if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { + pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); + e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + } +} + +void __init snp_fixup_e820_tables(void) +{ + __snp_fixup_e820_tables(probed_rmp_base); + __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); +} + /* * Do the necessary preparations which are verified by the firmware as * described in the SNP_INIT_EX firmware command description in the SNP @@ -174,11 +210,11 @@ static int __init snp_rmptable_init(void) u64 rmptable_size; u64 val; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return 0; if (!amd_iommu_snp_en) - return 0; + goto nosnp; if (!probed_rmp_size) goto nosnp; @@ -225,7 +261,7 @@ skip_enable: return 0; nosnp: - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); return -ENOSYS; } @@ -246,7 +282,7 @@ static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) { struct rmpentry *large_entry, *entry; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return ERR_PTR(-ENODEV); entry = get_rmpentry(pfn); @@ -363,7 +399,7 @@ int psmash(u64 pfn) unsigned long paddr = pfn << PAGE_SHIFT; int ret; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; if (!pfn_valid(pfn)) @@ -472,7 +508,7 @@ static int rmpupdate(u64 pfn, struct rmp_state *state) unsigned long paddr = pfn << PAGE_SHIFT; int ret, level; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return -ENODEV; level = RMP_TO_PG_LEVEL(state->pagesize); @@ -558,3 +594,13 @@ void snp_leak_pages(u64 pfn, unsigned int npages) spin_unlock(&snp_leaked_pages_list_lock); } EXPORT_SYMBOL_GPL(snp_leak_pages); + +void kdump_sev_callback(void) +{ + /* + * Do wbinvd() on remote CPUs when SNP is enabled in order to + * safely do SNP_SHUTDOWN on the local CPU. + */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + wbinvd(); +} diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 4d6826a76f78..49a1c6890b55 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -27,7 +27,6 @@ #include <linux/log2.h> #include <linux/acpi.h> #include <linux/suspend.h> -#include <linux/acpi.h> #include <asm/page.h> #include <asm/special_insns.h> #include <asm/msr-index.h> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index a65fc2ae15b4..77e788e928cd 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -81,7 +81,6 @@ config XEN_PVH bool "Xen PVH guest support" depends on XEN && XEN_PVHVM && ACPI select PVH - def_bool n help Support for running as a Xen PVH guest. 
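The __snp_fixup_e820_tables() hunk above reserves the 2MB chunk that contains each end of a mis-aligned RMP table so a later kexec kernel cannot allocate from it. A minimal user-space sketch of that alignment arithmetic follows; the e820 updates are replaced by a printf and the sample RMP placement is invented for illustration, so this is not the in-kernel implementation.

/*
 * Illustrative sketch only: models the 2MB-boundary arithmetic used by
 * __snp_fixup_e820_tables() above. The e820 helpers are replaced by a
 * printf so this builds stand-alone; the sample RMP base/size values
 * are assumptions, not taken from a real system.
 */
#include <stdio.h>
#include <stdint.h>

#define PMD_SIZE        (2ULL * 1024 * 1024)            /* 2MB leaf mapping */
#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

/* Stand-in for the e820 range updates: just report the 2MB chunk. */
static void fixup_end(uint64_t pa)
{
        if (IS_ALIGNED(pa, PMD_SIZE))
                return;                 /* already on a 2MB boundary */

        pa = ALIGN_DOWN(pa, PMD_SIZE);
        printf("reserve [0x%016llx - 0x%016llx)\n",
               (unsigned long long)pa,
               (unsigned long long)(pa + PMD_SIZE));
}

int main(void)
{
        /* Hypothetical, mis-aligned RMP table placement by the BIOS. */
        uint64_t rmp_base = 0x000000007f90c000ULL;
        uint64_t rmp_size = 0x0000000001104000ULL;

        fixup_end(rmp_base);            /* start of the table */
        fixup_end(rmp_base + rmp_size); /* end of the table   */
        return 0;
}

If either boundary already sits on a 2MB boundary, nothing is reserved for that end, matching the early IS_ALIGNED() return in the hunk above.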
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3c61bb98c10e..5f3a69f6ec34 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,11 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -#include <linux/memblock.h> -#endif #include <linux/console.h> #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/memblock.h> #include <linux/slab.h> #include <linux/panic_notifier.h> @@ -350,3 +348,34 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif + +/* Amount of extra memory space we add to the e820 ranges */ +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; + +void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns) +{ + unsigned int i; + + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + /* Add new region. */ + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; + break; + } + /* Append to existing region. */ + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; + break; + } + } + if (i == XEN_EXTRA_MEM_MAX_REGIONS) + printk(KERN_WARNING "Warning: not enough extra memory regions\n"); + + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); +} diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ace2eb054053..9ba53814ed6a 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -219,13 +219,21 @@ static __read_mostly unsigned int cpuid_leaf5_edx_val; static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { - unsigned maskebx = ~0; + unsigned int maskebx = ~0; + unsigned int or_ebx = 0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ switch (*ax) { + case 0x1: + /* Replace initial APIC ID in bits 24-31 of EBX. */ + /* See xen_pv_smp_config() for related topology preparations. */ + maskebx = 0x00ffffff; + or_ebx = smp_processor_id() << 24; + break; + case CPUID_MWAIT_LEAF: /* Synthesize the values.. */ *ax = 0; @@ -248,6 +256,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, : "0" (*ax), "2" (*cx)); *bx &= maskebx; + *bx |= or_ebx; } static bool __init xen_check_mwait(void) diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 9e9db601bd52..27a2a02ef8fb 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> #include <linux/export.h> +#include <linux/mm.h> #include <xen/hvc-console.h> @@ -73,3 +74,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p) } boot_params_p->e820_entries = memmap.nr_entries; } + +/* + * Reserve e820 UNUSABLE regions to inflate the memory balloon. + * + * On PVH dom0 the host memory map is used, RAM regions available to dom0 are + * located as the same place as in the native memory map, but since dom0 gets + * less memory than the total amount of host RAM the ranges that can't be + * populated are converted from RAM -> UNUSABLE. Use such regions (up to the + * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at + * boot. 
Doing so prevents the guest (even if just temporary) from using holes + * in the memory map in order to map grants or foreign addresses, and + * hopefully limits the risk of a clash with a device MMIO region. Ideally the + * hypervisor should notify us which memory ranges are suitable for creating + * foreign mappings, but that's not yet implemented. + */ +void __init xen_reserve_extra_memory(struct boot_params *bootp) +{ + unsigned int i, ram_pages = 0, extra_pages; + + for (i = 0; i < bootp->e820_entries; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + + if (e->type != E820_TYPE_RAM) + continue; + ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr); + } + + /* Max amount of extra memory. */ + extra_pages = EXTRA_MEM_RATIO * ram_pages; + + /* + * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping + * purposes. + */ + for (i = 0; i < bootp->e820_entries && extra_pages; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + unsigned long pages; + + if (e->type != E820_TYPE_UNUSABLE) + continue; + + pages = min(extra_pages, + PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr)); + + if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) { + struct boot_e820_entry *next; + + if (bootp->e820_entries == + ARRAY_SIZE(bootp->e820_table)) + /* No space left to split - skip region. */ + continue; + + /* Split entry. */ + next = e + 1; + memmove(next, e, + (bootp->e820_entries - i) * sizeof(*e)); + bootp->e820_entries++; + next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages); + e->size = next->addr - e->addr; + next->size -= e->size; + } + e->type = E820_TYPE_RAM; + extra_pages -= pages; + + xen_add_extra_mem(PFN_UP(e->addr), pages); + } +} diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b3e37961065a..380591028cb8 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -38,9 +38,6 @@ #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) -/* Amount of extra memory space we add to the e820 ranges */ -struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; - /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; @@ -64,18 +61,6 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; -/* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define EXTRA_MEM_RATIO (10) - static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -96,35 +81,6 @@ static void __init xen_parse_512gb(void) xen_512gb_limit = val; } -static void __init xen_add_extra_mem(unsigned long start_pfn, - unsigned long n_pfns) -{ - int i; - - /* - * No need to check for zero size, should happen rarely and will only - * write a new entry regarded to be unused due to zero size. - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - /* Add new region. */ - if (xen_extra_mem[i].n_pfns == 0) { - xen_extra_mem[i].start_pfn = start_pfn; - xen_extra_mem[i].n_pfns = n_pfns; - break; - } - /* Append to existing region. 
*/ - if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == - start_pfn) { - xen_extra_mem[i].n_pfns += n_pfns; - break; - } - } - if (i == XEN_EXTRA_MEM_MAX_REGIONS) - printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - - memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); -} - static void __init xen_del_extra_mem(unsigned long start_pfn, unsigned long n_pfns) { diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 27d1a5b7f571..ac41d83b38d3 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -154,9 +154,9 @@ static void __init xen_pv_smp_config(void) u32 apicid = 0; int i; - topology_register_boot_apic(apicid++); + topology_register_boot_apic(apicid); - for (i = 1; i < nr_cpu_ids; i++) + for (i = 0; i < nr_cpu_ids; i++) topology_register_apic(apicid++, CPU_ACPIID_INVALID, true); /* Pretend to be a proper enumerated system */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 04101b984f24..758bcd47b72d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen) ANNOTATE_NOENDBR cld - leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp + leaq __top_init_kernel_stack(%rip), %rsp /* Set up %gs. * diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a87ab36889e7..79cf93f2c92f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -163,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled); static inline void xen_hvm_post_suspend(int suspend_cancelled) {} #endif +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns); + #endif /* XEN_OPS_H */ |
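The xen_reserve_extra_memory() hunk above walks the boot e820 table, counts RAM pages, caps balloon inflation at EXTRA_MEM_RATIO times that count, and then converts UNUSABLE ranges to RAM handed to xen_add_extra_mem(). A minimal user-space sketch of that page accounting follows; it omits the entry-splitting path and uses an invented two-entry map, so it is only a model of the logic in the hunk, not the in-kernel code.

/*
 * Illustrative sketch only: mirrors the page accounting performed by
 * xen_reserve_extra_memory() above. The PFN macros and the
 * EXTRA_MEM_RATIO cap follow the hunk; the sample entries and enum
 * values are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT      12
#define PFN_UP(x)       (((x) + (1ULL << PAGE_SHIFT) - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)
#define EXTRA_MEM_RATIO 10

enum { E820_RAM = 1, E820_UNUSABLE = 5 };

struct entry { uint64_t addr, size; int type; };

int main(void)
{
        /* Hypothetical host-provided memory map as seen by a PVH dom0. */
        struct entry map[] = {
                { 0x0000000000100000ULL, 0x000000003ff00000ULL, E820_RAM },
                { 0x0000000040000000ULL, 0x0000000080000000ULL, E820_UNUSABLE },
        };
        uint64_t ram_pages = 0, extra_pages, pages;
        unsigned int i;

        /* Count pages backed by RAM entries. */
        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
                if (map[i].type == E820_RAM)
                        ram_pages += PFN_DOWN(map[i].addr + map[i].size) -
                                     PFN_UP(map[i].addr);

        /* Upper bound on balloon inflation, as in the hunk above. */
        extra_pages = EXTRA_MEM_RATIO * ram_pages;

        /* Convert UNUSABLE ranges, up to the cap. */
        for (i = 0; i < sizeof(map) / sizeof(map[0]) && extra_pages; i++) {
                if (map[i].type != E820_UNUSABLE)
                        continue;
                pages = PFN_DOWN(map[i].addr + map[i].size) -
                        PFN_UP(map[i].addr);
                if (pages > extra_pages)
                        pages = extra_pages;
                printf("would balloon %llu pages at pfn %llu\n",
                       (unsigned long long)pages,
                       (unsigned long long)PFN_UP(map[i].addr));
                extra_pages -= pages;
        }
        return 0;
}

The 10x cap corresponds to the EXTRA_MEM_RATIO comment moved into xen-ops.h: at more extreme base:extra ratios, the struct page arrays for the extra memory would consume most of the base RAM.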