summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig47
-rw-r--r--arch/x86/Kconfig.cpu2
-rw-r--r--arch/x86/Makefile.um7
-rw-r--r--arch/x86/boot/compressed/mem.c2
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c2
-rw-r--r--arch/x86/boot/compressed/sev.c107
-rw-r--r--arch/x86/boot/compressed/sev.h4
-rw-r--r--arch/x86/boot/genimage.sh5
-rw-r--r--arch/x86/coco/tdx/tdx.c26
-rw-r--r--arch/x86/crypto/Kconfig11
-rw-r--r--arch/x86/entry/calling.h2
-rw-r--r--arch/x86/entry/common.c2
-rw-r--r--arch/x86/entry/entry.S12
-rw-r--r--arch/x86/entry/entry_64.S20
-rw-r--r--arch/x86/events/amd/ibs.c20
-rw-r--r--arch/x86/events/amd/uncore.c36
-rw-r--r--arch/x86/events/core.c4
-rw-r--r--arch/x86/events/intel/core.c130
-rw-r--r--arch/x86/events/intel/ds.c30
-rw-r--r--arch/x86/events/intel/uncore_snbep.c107
-rw-r--r--arch/x86/events/perf_event.h12
-rw-r--r--arch/x86/events/rapl.c1
-rw-r--r--arch/x86/hyperv/hv_init.c33
-rw-r--r--arch/x86/hyperv/hv_vtl.c45
-rw-r--r--arch/x86/hyperv/irqdomain.c4
-rw-r--r--arch/x86/hyperv/ivm.c27
-rw-r--r--arch/x86/include/asm/alternative.h32
-rw-r--r--arch/x86/include/asm/bug.h5
-rw-r--r--arch/x86/include/asm/cfi.h11
-rw-r--r--arch/x86/include/asm/cpu.h12
-rw-r--r--arch/x86/include/asm/cpufeatures.h10
-rw-r--r--arch/x86/include/asm/debugreg.h19
-rw-r--r--arch/x86/include/asm/ibt.h4
-rw-r--r--arch/x86/include/asm/intel-family.h3
-rw-r--r--arch/x86/include/asm/irqflags.h44
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h6
-rw-r--r--arch/x86/include/asm/msr-index.h9
-rw-r--r--arch/x86/include/asm/mwait.h37
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/nospec-branch.h75
-rw-r--r--arch/x86/include/asm/paravirt.h20
-rw-r--r--arch/x86/include/asm/paravirt_types.h3
-rw-r--r--arch/x86/include/asm/percpu.h28
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/sev-common.h2
-rw-r--r--arch/x86/include/asm/sighandling.h22
-rw-r--r--arch/x86/include/asm/tdx.h6
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h4
-rw-r--r--arch/x86/include/uapi/asm/debugreg.h21
-rw-r--r--arch/x86/include/uapi/asm/e820.h4
-rw-r--r--arch/x86/include/uapi/asm/ldt.h4
-rw-r--r--arch/x86/include/uapi/asm/msr.h4
-rw-r--r--arch/x86/include/uapi/asm/ptrace-abi.h6
-rw-r--r--arch/x86/include/uapi/asm/ptrace.h4
-rw-r--r--arch/x86/include/uapi/asm/setup_data.h4
-rw-r--r--arch/x86/include/uapi/asm/signal.h8
-rw-r--r--arch/x86/kernel/acpi/boot.c11
-rw-r--r--arch/x86/kernel/alternative.c238
-rw-r--r--arch/x86/kernel/amd_nb.c9
-rw-r--r--arch/x86/kernel/cfi.c22
-rw-r--r--arch/x86/kernel/cpu/amd.c91
-rw-r--r--arch/x86/kernel/cpu/bugs.c349
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c2
-rw-r--r--arch/x86/kernel/cpu/common.c127
-rw-r--r--arch/x86/kernel/cpu/intel.c52
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c28
-rw-r--r--arch/x86/kernel/cpu/mce/core.c24
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c11
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c37
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_shas.c112
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c60
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c3
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c10
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c7
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c2
-rw-r--r--arch/x86/kernel/cpu/vmware.c4
-rw-r--r--arch/x86/kernel/devicetree.c3
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/e820.c17
-rw-r--r--arch/x86/kernel/fpu/core.c6
-rw-r--r--arch/x86/kernel/fpu/signal.c11
-rw-r--r--arch/x86/kernel/fpu/xstate.h22
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head32.c4
-rw-r--r--arch/x86/kernel/i8253.c3
-rw-r--r--arch/x86/kernel/ioport.c13
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/module.c6
-rw-r--r--arch/x86/kernel/nmi.c42
-rw-r--r--arch/x86/kernel/paravirt.c13
-rw-r--r--arch/x86/kernel/process.c40
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/reboot.c10
-rw-r--r--arch/x86/kernel/signal_32.c66
-rw-r--r--arch/x86/kernel/signal_64.c4
-rw-r--r--arch/x86/kernel/smpboot.c6
-rw-r--r--arch/x86/kernel/static_call.c4
-rw-r--r--arch/x86/kernel/traps.c134
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kernel/uprobes.c14
-rw-r--r--arch/x86/kernel/vmlinux.lds.S10
-rw-r--r--arch/x86/kvm/cpuid.c22
-rw-r--r--arch/x86/kvm/cpuid.h1
-rw-r--r--arch/x86/kvm/emulate.c15
-rw-r--r--arch/x86/kvm/hyperv.c3
-rw-r--r--arch/x86/kvm/kvm_emulate.h5
-rw-r--r--arch/x86/kvm/mmu.h1
-rw-r--r--arch/x86/kvm/mmu/mmu.c79
-rw-r--r--arch/x86/kvm/mtrr.c1
-rw-r--r--arch/x86/kvm/reverse_cpuid.h8
-rw-r--r--arch/x86/kvm/smm.c1
-rw-r--r--arch/x86/kvm/svm/avic.c66
-rw-r--r--arch/x86/kvm/svm/sev.c30
-rw-r--r--arch/x86/kvm/svm/svm.c55
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/svm/vmenter.S16
-rw-r--r--arch/x86/kvm/vmx/hyperv.c1
-rw-r--r--arch/x86/kvm/vmx/nested.c24
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c2
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c28
-rw-r--r--arch/x86/kvm/vmx/sgx.c5
-rw-r--r--arch/x86/kvm/vmx/vmx.c19
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c64
-rw-r--r--arch/x86/kvm/x86.h48
-rw-r--r--arch/x86/kvm/xen.c17
-rw-r--r--arch/x86/lib/copy_user_64.S18
-rw-r--r--arch/x86/lib/retpoline.S39
-rw-r--r--arch/x86/lib/x86-opcode-map.txt54
-rw-r--r--arch/x86/mm/init.c32
-rw-r--r--arch/x86/mm/init_64.c15
-rw-r--r--arch/x86/mm/kaslr.c10
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c4
-rw-r--r--arch/x86/mm/pat/cpa-test.c2
-rw-r--r--arch/x86/mm/pat/memtype.c52
-rw-r--r--arch/x86/mm/pat/set_memory.c6
-rw-r--r--arch/x86/mm/tlb.c29
-rw-r--r--arch/x86/net/bpf_jit_comp.c58
-rw-r--r--arch/x86/pci/xen.c8
-rw-r--r--arch/x86/power/cpu.c14
-rw-r--r--arch/x86/tools/insn_decoder_test.c5
-rw-r--r--arch/x86/um/asm/checksum.h3
-rw-r--r--arch/x86/um/os-Linux/mcontext.c3
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c5
-rw-r--r--arch/x86/xen/enlighten.c10
-rw-r--r--arch/x86/xen/enlighten_pvh.c19
-rw-r--r--arch/x86/xen/multicalls.c26
-rw-r--r--arch/x86/xen/setup.c3
-rw-r--r--arch/x86/xen/smp_pv.c1
-rw-r--r--arch/x86/xen/xen-ops.h3
160 files changed, 2719 insertions, 988 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6f8e9af827e0..2df0ae2a5e5d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -137,7 +137,7 @@ config X86
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_GENERAL_HUGETLB
- select ARCH_WANT_HUGE_PMD_SHARE
+ select ARCH_WANT_HUGE_PMD_SHARE if X86_64
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
@@ -226,7 +226,7 @@ config X86
select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
- select HAVE_EISA
+ select HAVE_EISA if X86_32
select HAVE_EXIT_THREAD
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
@@ -894,6 +894,7 @@ config INTEL_TDX_GUEST
depends on X86_64 && CPU_SUP_INTEL
depends on X86_X2APIC
depends on EFI_STUB
+ depends on PARAVIRT
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select X86_MCE
@@ -2419,6 +2420,7 @@ config STRICT_SIGALTSTACK_SIZE
config CFI_AUTO_DEFAULT
bool "Attempt to use FineIBT by default at boot time"
depends on FINEIBT
+ depends on !RUST || RUSTC_VERSION >= 108800
default y
help
Attempt to use FineIBT by default at boot time. If enabled,
@@ -2433,18 +2435,20 @@ config CC_HAS_NAMED_AS
def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -S -o /dev/null)
depends on CC_IS_GCC
+#
+# -fsanitize=kernel-address (KASAN) and -fsanitize=thread (KCSAN)
+# are incompatible with named address spaces with GCC < 13.3
+# (see GCC PR sanitizer/111736 and also PR sanitizer/115172).
+#
+
config CC_HAS_NAMED_AS_FIXED_SANITIZERS
- def_bool CC_IS_GCC && GCC_VERSION >= 130300
+ def_bool y
+ depends on !(KASAN || KCSAN) || GCC_VERSION >= 130300
+ depends on !(UBSAN_BOOL && KASAN) || GCC_VERSION >= 140200
config USE_X86_SEG_SUPPORT
- def_bool y
- depends on CC_HAS_NAMED_AS
- #
- # -fsanitize=kernel-address (KASAN) and -fsanitize=thread
- # (KCSAN) are incompatible with named address spaces with
- # GCC < 13.3 - see GCC PR sanitizer/111736.
- #
- depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS
+ def_bool CC_HAS_NAMED_AS
+ depends on CC_HAS_NAMED_AS_FIXED_SANITIZERS
config CC_HAS_SLS
def_bool $(cc-option,-mharden-sls=all)
@@ -2744,6 +2748,27 @@ config MITIGATION_SSB
of speculative execution in a similar way to the Meltdown and Spectre
security vulnerabilities.
+config MITIGATION_ITS
+ bool "Enable Indirect Target Selection mitigation"
+ depends on CPU_SUP_INTEL && X86_64
+ depends on MITIGATION_RETPOLINE && MITIGATION_RETHUNK
+ select EXECMEM
+ default y
+ help
+ Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in
+ BPU on some Intel CPUs that may allow Spectre V2 style attacks. If
+ disabled, mitigation cannot be enabled via cmdline.
+ See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst>
+
+config MITIGATION_TSA
+ bool "Mitigate Transient Scheduler Attacks"
+ depends on CPU_SUP_AMD
+ default y
+ help
+ Enable mitigation for Transient Scheduler Attacks. TSA is a hardware
+ security vulnerability on AMD CPUs which can lead to forwarding of
+ invalid info to subsequent instructions and thus can affect their
+ timing and thereby cause a leakage.
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2a7279d80460..42e6a40876ea 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -368,7 +368,7 @@ config X86_HAVE_PAE
config X86_CMPXCHG64
def_bool y
- depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7
+ depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX
# this should be set for all -march=.. options where the compiler
# generates cmov.
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index a46b1397ad01..c86cbd9cbba3 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -7,12 +7,13 @@ core-y += arch/x86/crypto/
# GCC versions < 11. See:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652
#
-ifeq ($(CONFIG_CC_IS_CLANG),y)
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
-KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
+ifeq ($(call gcc-min-version, 110000)$(CONFIG_CC_IS_CLANG),y)
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
endif
+KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
+
ifeq ($(CONFIG_X86_32),y)
START := 0x8048000
diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c
index dbba332e4a12..0e9f84ab4bdc 100644
--- a/arch/x86/boot/compressed/mem.c
+++ b/arch/x86/boot/compressed/mem.c
@@ -38,7 +38,7 @@ void arch_accept_memory(phys_addr_t start, phys_addr_t end)
if (early_is_tdx_guest()) {
if (!tdx_accept_memory(start, end))
panic("TDX: Failed to accept memory\n");
- } else if (sev_snp_enabled()) {
+ } else if (early_is_sevsnp_guest()) {
snp_accept_memory(start, end);
} else {
error("Cannot accept memory: unknown platform\n");
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index c882e1f67af0..d8c5de40669d 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
#include <asm/bootparam.h>
+#include <asm/bootparam_utils.h>
#include <asm/e820/types.h>
#include <asm/processor.h>
#include "pgtable.h"
@@ -107,6 +108,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
bool l5_required = false;
/* Initialize boot_params. Required for cmdline_find_option_bool(). */
+ sanitize_boot_params(bp);
boot_params_ptr = bp;
/*
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index cd44e120fe53..a93e36338866 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -164,10 +164,7 @@ bool sev_snp_enabled(void)
static void __page_state_change(unsigned long paddr, enum psc_op op)
{
- u64 val;
-
- if (!sev_snp_enabled())
- return;
+ u64 val, msr;
/*
* If private -> shared then invalidate the page before requesting the
@@ -176,6 +173,9 @@ static void __page_state_change(unsigned long paddr, enum psc_op op)
if (op == SNP_PAGE_STATE_SHARED)
pvalidate_4k_page(paddr, paddr, false);
+ /* Save the current GHCB MSR value */
+ msr = sev_es_rd_ghcb_msr();
+
/* Issue VMGEXIT to change the page state in RMP table. */
sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
VMGEXIT();
@@ -185,6 +185,9 @@ static void __page_state_change(unsigned long paddr, enum psc_op op)
if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+ /* Restore the GHCB MSR value */
+ sev_es_wr_ghcb_msr(msr);
+
/*
* Now that page state is changed in the RMP table, validate it so that it is
* consistent with the RMP entry.
@@ -195,11 +198,17 @@ static void __page_state_change(unsigned long paddr, enum psc_op op)
void snp_set_page_private(unsigned long paddr)
{
+ if (!sev_snp_enabled())
+ return;
+
__page_state_change(paddr, SNP_PAGE_STATE_PRIVATE);
}
void snp_set_page_shared(unsigned long paddr)
{
+ if (!sev_snp_enabled())
+ return;
+
__page_state_change(paddr, SNP_PAGE_STATE_SHARED);
}
@@ -223,56 +232,10 @@ static bool early_setup_ghcb(void)
return true;
}
-static phys_addr_t __snp_accept_memory(struct snp_psc_desc *desc,
- phys_addr_t pa, phys_addr_t pa_end)
-{
- struct psc_hdr *hdr;
- struct psc_entry *e;
- unsigned int i;
-
- hdr = &desc->hdr;
- memset(hdr, 0, sizeof(*hdr));
-
- e = desc->entries;
-
- i = 0;
- while (pa < pa_end && i < VMGEXIT_PSC_MAX_ENTRY) {
- hdr->end_entry = i;
-
- e->gfn = pa >> PAGE_SHIFT;
- e->operation = SNP_PAGE_STATE_PRIVATE;
- if (IS_ALIGNED(pa, PMD_SIZE) && (pa_end - pa) >= PMD_SIZE) {
- e->pagesize = RMP_PG_SIZE_2M;
- pa += PMD_SIZE;
- } else {
- e->pagesize = RMP_PG_SIZE_4K;
- pa += PAGE_SIZE;
- }
-
- e++;
- i++;
- }
-
- if (vmgexit_psc(boot_ghcb, desc))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
-
- pvalidate_pages(desc);
-
- return pa;
-}
-
void snp_accept_memory(phys_addr_t start, phys_addr_t end)
{
- struct snp_psc_desc desc = {};
- unsigned int i;
- phys_addr_t pa;
-
- if (!boot_ghcb && !early_setup_ghcb())
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
-
- pa = start;
- while (pa < end)
- pa = __snp_accept_memory(&desc, pa, end);
+ for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE)
+ __page_state_change(pa, SNP_PAGE_STATE_PRIVATE);
}
void sev_es_shutdown_ghcb(void)
@@ -681,3 +644,43 @@ void sev_prep_identity_maps(unsigned long top_level_pgt)
sev_verify_cbit(top_level_pgt);
}
+
+bool early_is_sevsnp_guest(void)
+{
+ static bool sevsnp;
+
+ if (sevsnp)
+ return true;
+
+ if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED))
+ return false;
+
+ sevsnp = true;
+
+ if (!snp_vmpl) {
+ unsigned int eax, ebx, ecx, edx;
+
+ /*
+ * CPUID Fn8000_001F_EAX[28] - SVSM support
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax & BIT(28)) {
+ struct msr m;
+
+ /* Obtain the address of the calling area to use */
+ boot_rdmsr(MSR_SVSM_CAA, &m);
+ boot_svsm_caa = (void *)m.q;
+ boot_svsm_caa_pa = m.q;
+
+ /*
+ * The real VMPL level cannot be discovered, but the
+ * memory acceptance routines make no use of that so
+ * any non-zero value suffices here.
+ */
+ snp_vmpl = U8_MAX;
+ }
+ }
+ return true;
+}
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index fc725a981b09..d3900384b8ab 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -12,11 +12,15 @@
bool sev_snp_enabled(void);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
+u64 sev_get_status(void);
+bool early_is_sevsnp_guest(void);
#else
static inline bool sev_snp_enabled(void) { return false; }
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
+static inline u64 sev_get_status(void) { return 0; }
+static inline bool early_is_sevsnp_guest(void) { return false; }
#endif
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index c9299aeb7333..3882ead513f7 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -22,6 +22,7 @@
# This script requires:
# bash
# syslinux
+# genisoimage
# mtools (for fdimage* and hdimage)
# edk2/OVMF (for hdimage)
#
@@ -251,7 +252,9 @@ geniso() {
cp "$isolinux" "$ldlinux" "$tmp_dir"
cp "$FBZIMAGE" "$tmp_dir"/linux
echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg
- cp "${FDINITRDS[@]}" "$tmp_dir"/
+ if [ ${#FDINITRDS[@]} -gt 0 ]; then
+ cp "${FDINITRDS[@]}" "$tmp_dir"/
+ fi
genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \
-quiet -o "$FIMAGE" -b isolinux.bin \
-c boot.cat -no-emul-boot -boot-load-size 4 \
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 2f85ed005c42..b8aeb3ac7d28 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -14,6 +14,7 @@
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
@@ -359,7 +360,7 @@ static int handle_halt(struct ve_info *ve)
return ve_instr_len(ve);
}
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_halt(void)
{
const bool irq_disabled = false;
@@ -370,6 +371,16 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
+static void __cpuidle tdx_safe_halt(void)
+{
+ tdx_halt();
+ /*
+ * "__cpuidle" section doesn't support instrumentation, so stick
+ * with raw_* variant that avoids tracing hooks.
+ */
+ raw_local_irq_enable();
+}
+
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_module_args args = {
@@ -1057,6 +1068,19 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
/*
+ * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
+ * will enable interrupts before HLT TDCALL invocation if executed
+ * in STI-shadow, possibly resulting in missed wakeup events.
+ *
+ * Modify all possible HLT execution paths to use TDX specific routines
+ * that directly execute TDCALL and toggle the interrupt state as
+ * needed after TDCALL completion. This also reduces HLT related #VEs
+ * in addition to having a reliable halt logic execution.
+ */
+ pv_ops.irq.safe_halt = tdx_safe_halt;
+ pv_ops.irq.halt = tdx_halt;
+
+ /*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
* there.
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 7b1bebed879d..46b53ab06165 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -3,10 +3,12 @@
menu "Accelerated Cryptographic Algorithms for CPU (x86)"
config CRYPTO_CURVE25519_X86
- tristate "Public key crypto: Curve25519 (ADX)"
+ tristate
depends on X86 && 64BIT
+ select CRYPTO_KPP
select CRYPTO_LIB_CURVE25519_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ default CRYPTO_LIB_CURVE25519_INTERNAL
help
Curve25519 algorithm
@@ -348,11 +350,12 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64
Processes 64 blocks in parallel.
config CRYPTO_CHACHA20_X86_64
- tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)"
+ tristate
depends on X86 && 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ default CRYPTO_LIB_CHACHA_INTERNAL
help
Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
stream cipher algorithms
@@ -417,10 +420,12 @@ config CRYPTO_POLYVAL_CLMUL_NI
- CLMUL-NI (carry-less multiplication new instructions)
config CRYPTO_POLY1305_X86_64
- tristate "Hash functions: Poly1305 (SSE2/AVX2)"
+ tristate
depends on X86 && 64BIT
+ select CRYPTO_HASH
select CRYPTO_LIB_POLY1305_GENERIC
select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ default CRYPTO_LIB_POLY1305_INTERNAL
help
Poly1305 authenticator algorithm (RFC7539)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index ea81770629ee..626a81c6015b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -70,6 +70,8 @@ For 32-bit we have the following conventions - kernel is built with
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */
+ /* We just clobbered the return address - use the IRET frame for unwinding: */
+ UNWIND_HINT_IRET_REGS offset=3*8
.else
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 94941c5a10ac..51efd2da4d7f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -142,7 +142,7 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs)
#ifdef CONFIG_IA32_EMULATION
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
-static int ia32_emulation_override_cmdline(char *arg)
+static int __init ia32_emulation_override_cmdline(char *arg)
{
return kstrtobool(arg, &__ia32_enabled);
}
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index b7ea3e8e9ecc..b0d5ab951231 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -18,7 +18,7 @@
SYM_FUNC_START(entry_ibpb)
movl $MSR_IA32_PRED_CMD, %ecx
- movl $PRED_CMD_IBPB, %eax
+ movl _ASM_RIP(x86_pred_cmd), %eax
xorl %edx, %edx
wrmsr
@@ -33,20 +33,20 @@ EXPORT_SYMBOL_GPL(entry_ibpb);
/*
* Define the VERW operand that is disguised as entry code so that
- * it can be referenced with KPTI enabled. This ensure VERW can be
+ * it can be referenced with KPTI enabled. This ensures VERW can be
* used late in exit-to-user path after page tables are switched.
*/
.pushsection .entry.text, "ax"
.align L1_CACHE_BYTES, 0xcc
-SYM_CODE_START_NOALIGN(mds_verw_sel)
+SYM_CODE_START_NOALIGN(x86_verw_sel)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
.word __KERNEL_DS
.align L1_CACHE_BYTES, 0xcc
-SYM_CODE_END(mds_verw_sel);
+SYM_CODE_END(x86_verw_sel);
/* For KVM */
-EXPORT_SYMBOL_GPL(mds_verw_sel);
+EXPORT_SYMBOL_GPL(x86_verw_sel);
.popsection
@@ -63,7 +63,7 @@ THUNK warn_thunk_thunk, __warn_thunk
* entirely in the C code, and use an alias emitted by the linker script
* instead.
*/
-#ifdef CONFIG_STACKPROTECTOR
+#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP)
EXPORT_SYMBOL(__ref_stack_chk_guard);
#endif
#endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1b5be07f8669..9c6a110a52d4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1524,7 +1524,9 @@ SYM_CODE_END(rewind_stack_and_make_dead)
* ORC to unwind properly.
*
* The alignment is for performance and not for safety, and may be safely
- * refactored in the future if needed.
+ * refactored in the future if needed. The .skips are for safety, to ensure
+ * that all RETs are in the second half of a cacheline to mitigate Indirect
+ * Target Selection, rather than taking the slowpath via its_return_thunk.
*/
SYM_FUNC_START(clear_bhb_loop)
push %rbp
@@ -1534,10 +1536,22 @@ SYM_FUNC_START(clear_bhb_loop)
call 1f
jmp 5f
.align 64, 0xcc
+ /*
+ * Shift instructions so that the RET is in the upper half of the
+ * cacheline and don't take the slowpath to its_return_thunk.
+ */
+ .skip 32 - (.Lret1 - 1f), 0xcc
ANNOTATE_INTRA_FUNCTION_CALL
1: call 2f
- RET
+.Lret1: RET
.align 64, 0xcc
+ /*
+ * As above shift instructions for RET at .Lret2 as well.
+ *
+ * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
+ * but some Clang versions (e.g. 18) don't like this.
+ */
+ .skip 32 - 18, 0xcc
2: movl $5, %eax
3: jmp 4f
nop
@@ -1545,7 +1559,7 @@ SYM_FUNC_START(clear_bhb_loop)
jnz 3b
sub $1, %ecx
jnz 1b
- RET
+.Lret2: RET
5: lfence
pop %rbp
RET
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index c3a2f6f57770..d34ee6f04f18 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -272,7 +272,7 @@ static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct perf_ibs *perf_ibs;
- u64 max_cnt, config;
+ u64 config;
int ret;
perf_ibs = get_ibs_pmu(event->attr.type);
@@ -309,10 +309,19 @@ static int perf_ibs_init(struct perf_event *event)
if (!hwc->sample_period)
hwc->sample_period = 0x10;
} else {
- max_cnt = config & perf_ibs->cnt_mask;
+ u64 period = 0;
+
+ if (perf_ibs == &perf_ibs_op) {
+ period = (config & IBS_OP_MAX_CNT) << 4;
+ if (ibs_caps & IBS_CAPS_OPCNTEXT)
+ period |= config & IBS_OP_MAX_CNT_EXT_MASK;
+ } else {
+ period = (config & IBS_FETCH_MAX_CNT) << 4;
+ }
+
config &= ~perf_ibs->cnt_mask;
- event->attr.sample_period = max_cnt << 4;
- hwc->sample_period = event->attr.sample_period;
+ event->attr.sample_period = period;
+ hwc->sample_period = period;
}
if (!hwc->sample_period)
@@ -1222,7 +1231,8 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_OPCNTEXT) {
perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK;
perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK;
- perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK;
+ perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK |
+ IBS_OP_CUR_CNT_EXT_MASK);
}
if (ibs_caps & IBS_CAPS_ZEN4)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 0bfde2ea5cb8..cdf7bf029836 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -38,7 +38,6 @@ struct amd_uncore_ctx {
int refcnt;
int cpu;
struct perf_event **events;
- struct hlist_node node;
};
struct amd_uncore_pmu {
@@ -890,6 +889,39 @@ static void amd_uncore_umc_start(struct perf_event *event, int flags)
perf_event_update_userpage(event);
}
+static void amd_uncore_umc_read(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev, new, shift;
+ s64 delta;
+
+ shift = COUNTER_SHIFT + 1;
+ prev = local64_read(&hwc->prev_count);
+
+ /*
+ * UMC counters do not have RDPMC assignments. Read counts directly
+ * from the corresponding PERF_CTR.
+ */
+ rdmsrl(hwc->event_base, new);
+
+ /*
+ * Unlike the other uncore counters, UMC counters saturate and set the
+ * Overflow bit (bit 48) on overflow. Since they do not roll over,
+ * proactively reset the corresponding PERF_CTR when bit 47 is set so
+ * that the counter never gets a chance to saturate.
+ */
+ if (new & BIT_ULL(63 - COUNTER_SHIFT)) {
+ wrmsrl(hwc->event_base, 0);
+ local64_set(&hwc->prev_count, 0);
+ } else {
+ local64_set(&hwc->prev_count, new);
+ }
+
+ delta = (new << shift) - (prev << shift);
+ delta >>= shift;
+ local64_add(delta, &event->count);
+}
+
static
void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
{
@@ -967,7 +999,7 @@ int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
.del = amd_uncore_del,
.start = amd_uncore_umc_start,
.stop = amd_uncore_stop,
- .read = amd_uncore_read,
+ .read = amd_uncore_umc_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
.module = THIS_MODULE,
};
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 0d33c85da453..471eaa46d55f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -628,7 +628,7 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.type == event->pmu->type)
event->hw.config |= x86_pmu_get_event_config(event);
- if (!event->attr.freq && x86_pmu.limit_period) {
+ if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) {
s64 left = event->attr.sample_period;
x86_pmu.limit_period(event, &left);
if (left > event->attr.sample_period)
@@ -753,7 +753,7 @@ void x86_pmu_enable_all(int added)
}
}
-static inline int is_x86_event(struct perf_event *event)
+int is_x86_event(struct perf_event *event)
{
int i;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9ec3170c18f9..5e43d390f7a3 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2779,28 +2779,33 @@ static u64 icl_update_topdown_event(struct perf_event *event)
DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
-static void intel_pmu_read_topdown_event(struct perf_event *event)
+static void intel_pmu_read_event(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN)) {
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ bool pmu_enabled = cpuc->enabled;
- /* Only need to call update_topdown_event() once for group read. */
- if ((cpuc->txn_flags & PERF_PMU_TXN_READ) &&
- !is_slots_event(event))
- return;
+ /* Only need to call update_topdown_event() once for group read. */
+ if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ))
+ return;
- perf_pmu_disable(event->pmu);
- static_call(intel_pmu_update_topdown_event)(event);
- perf_pmu_enable(event->pmu);
-}
+ cpuc->enabled = 0;
+ if (pmu_enabled)
+ intel_pmu_disable_all();
-static void intel_pmu_read_event(struct perf_event *event)
-{
- if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
- intel_pmu_auto_reload_read(event);
- else if (is_topdown_count(event))
- intel_pmu_read_topdown_event(event);
- else
- x86_perf_event_update(event);
+ if (is_topdown_event(event))
+ static_call(intel_pmu_update_topdown_event)(event);
+ else
+ intel_pmu_drain_pebs_buffer();
+
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ intel_pmu_enable_all(0);
+
+ return;
+ }
+
+ x86_perf_event_update(event);
}
static void intel_pmu_enable_fixed(struct perf_event *event)
@@ -3067,7 +3072,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
handled++;
x86_pmu_handle_guest_pebs(regs, &data);
- x86_pmu.drain_pebs(regs, &data);
+ static_call(x86_pmu_drain_pebs)(regs, &data);
status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
/*
@@ -3949,6 +3954,85 @@ static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
}
+static u64 intel_pmu_freq_start_period(struct perf_event *event)
+{
+ int type = event->attr.type;
+ u64 config, factor;
+ s64 start;
+
+ /*
+ * The 127 is the lowest possible recommended SAV (sample after value)
+ * for a 4000 freq (default freq), according to the event list JSON file.
+ * Also, assume the workload is idle 50% time.
+ */
+ factor = 64 * 4000;
+ if (type != PERF_TYPE_HARDWARE && type != PERF_TYPE_HW_CACHE)
+ goto end;
+
+ /*
+ * The estimation of the start period in the freq mode is
+ * based on the below assumption.
+ *
+ * For a cycles or an instructions event, 1GHZ of the
+ * underlying platform, 1 IPC. The workload is idle 50% time.
+ * The start period = 1,000,000,000 * 1 / freq / 2.
+ * = 500,000,000 / freq
+ *
+ * Usually, the branch-related events occur less than the
+ * instructions event. According to the Intel event list JSON
+ * file, the SAV (sample after value) of a branch-related event
+ * is usually 1/4 of an instruction event.
+ * The start period of branch-related events = 125,000,000 / freq.
+ *
+ * The cache-related events occurs even less. The SAV is usually
+ * 1/20 of an instruction event.
+ * The start period of cache-related events = 25,000,000 / freq.
+ */
+ config = event->attr.config & PERF_HW_EVENT_MASK;
+ if (type == PERF_TYPE_HARDWARE) {
+ switch (config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ case PERF_COUNT_HW_INSTRUCTIONS:
+ case PERF_COUNT_HW_BUS_CYCLES:
+ case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND:
+ case PERF_COUNT_HW_STALLED_CYCLES_BACKEND:
+ case PERF_COUNT_HW_REF_CPU_CYCLES:
+ factor = 500000000;
+ break;
+ case PERF_COUNT_HW_BRANCH_INSTRUCTIONS:
+ case PERF_COUNT_HW_BRANCH_MISSES:
+ factor = 125000000;
+ break;
+ case PERF_COUNT_HW_CACHE_REFERENCES:
+ case PERF_COUNT_HW_CACHE_MISSES:
+ factor = 25000000;
+ break;
+ default:
+ goto end;
+ }
+ }
+
+ if (type == PERF_TYPE_HW_CACHE)
+ factor = 25000000;
+end:
+ /*
+ * Usually, a prime or a number with less factors (close to prime)
+ * is chosen as an SAV, which makes it less likely that the sampling
+ * period synchronizes with some periodic event in the workload.
+ * Minus 1 to make it at least avoiding values near power of twos
+ * for the default freq.
+ */
+ start = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1;
+
+ if (start > x86_pmu.max_period)
+ start = x86_pmu.max_period;
+
+ if (x86_pmu.limit_period)
+ x86_pmu.limit_period(event, &start);
+
+ return start;
+}
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -3960,6 +4044,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (ret)
return ret;
+ if (event->attr.freq && event->attr.sample_freq) {
+ event->hw.sample_period = intel_pmu_freq_start_period(event);
+ event->hw.last_period = event->hw.sample_period;
+ local64_set(&event->hw.period_left, event->hw.sample_period);
+ }
+
if (event->attr.precise_ip) {
if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
return -EINVAL;
@@ -4243,7 +4333,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
arr[pebs_enable] = (struct perf_guest_switch_msr){
.msr = MSR_IA32_PEBS_ENABLE,
.host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
- .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
+ .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask & kvm_pmu->pebs_enable,
};
if (arr[pebs_enable].host) {
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index c07ca43e67e7..54007174c15b 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -932,11 +932,11 @@ unlock:
return 1;
}
-static inline void intel_pmu_drain_pebs_buffer(void)
+void intel_pmu_drain_pebs_buffer(void)
{
struct perf_sample_data data;
- x86_pmu.drain_pebs(NULL, &data);
+ static_call(x86_pmu_drain_pebs)(NULL, &data);
}
/*
@@ -1317,8 +1317,10 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
* + precise_ip < 2 for the non event IP
* + For RTM TSX weight we need GPRs for the abort code.
*/
- gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
- (attr->sample_regs_intr & PEBS_GP_REGS);
+ gprs = ((sample_type & PERF_SAMPLE_REGS_INTR) &&
+ (attr->sample_regs_intr & PEBS_GP_REGS)) ||
+ ((sample_type & PERF_SAMPLE_REGS_USER) &&
+ (attr->sample_regs_user & PEBS_GP_REGS));
tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
((attr->config & INTEL_ARCH_EVENT_MASK) ==
@@ -1970,7 +1972,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
regs->flags &= ~PERF_EFLAGS_EXACT;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR)
+ if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER))
adaptive_pebs_save_regs(regs, gprs);
}
@@ -2079,15 +2081,6 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
return NULL;
}
-void intel_pmu_auto_reload_read(struct perf_event *event)
-{
- WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
-
- perf_pmu_disable(event->pmu);
- intel_pmu_drain_pebs_buffer();
- perf_pmu_enable(event->pmu);
-}
-
/*
* Special variant of intel_pmu_save_and_restart() for auto-reload.
*/
@@ -2247,8 +2240,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
setup_pebs_fixed_sample_data);
}
-static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
+static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask)
{
+ u64 pebs_enabled = cpuc->pebs_enabled & mask;
struct perf_event *event;
int bit;
@@ -2259,7 +2253,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int
* It needs to call intel_pmu_save_and_restart_reload() to
* update the event->count for this case.
*/
- for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) {
+ for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
intel_pmu_save_and_restart_reload(event, 0);
@@ -2294,7 +2288,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
}
if (unlikely(base >= top)) {
- intel_pmu_pebs_event_update_no_drain(cpuc, size);
+ intel_pmu_pebs_event_update_no_drain(cpuc, mask);
return;
}
@@ -2404,7 +2398,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
(hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
if (unlikely(base >= top)) {
- intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
+ intel_pmu_pebs_event_update_no_drain(cpuc, mask);
return;
}
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index ca98744343b8..543609d1231e 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -4891,28 +4891,28 @@ static struct uncore_event_desc snr_uncore_iio_freerunning_events[] = {
INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
/* Free-Running IIO BANDWIDTH IN Counters */
INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"),
INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.0517578125e-5"),
INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"),
{ /* end: all zeroes */ },
};
@@ -5485,37 +5485,6 @@ static struct freerunning_counters icx_iio_freerunning[] = {
[ICX_IIO_MSR_BW_IN] = { 0xaa0, 0x1, 0x10, 8, 48, icx_iio_bw_freerunning_box_offsets },
};
-static struct uncore_event_desc icx_uncore_iio_freerunning_events[] = {
- /* Free-Running IIO CLOCKS Counter */
- INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
- /* Free-Running IIO BANDWIDTH IN Counters */
- INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"),
- { /* end: all zeroes */ },
-};
-
static struct intel_uncore_type icx_uncore_iio_free_running = {
.name = "iio_free_running",
.num_counters = 9,
@@ -5523,7 +5492,7 @@ static struct intel_uncore_type icx_uncore_iio_free_running = {
.num_freerunning_types = ICX_IIO_FREERUNNING_TYPE_MAX,
.freerunning = icx_iio_freerunning,
.ops = &skx_uncore_iio_freerunning_ops,
- .event_descs = icx_uncore_iio_freerunning_events,
+ .event_descs = snr_uncore_iio_freerunning_events,
.format_group = &skx_uncore_iio_freerunning_format_group,
};
@@ -6320,69 +6289,13 @@ static struct freerunning_counters spr_iio_freerunning[] = {
[SPR_IIO_MSR_BW_OUT] = { 0x3808, 0x1, 0x10, 8, 48 },
};
-static struct uncore_event_desc spr_uncore_iio_freerunning_events[] = {
- /* Free-Running IIO CLOCKS Counter */
- INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
- /* Free-Running IIO BANDWIDTH IN Counters */
- INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"),
- /* Free-Running IIO BANDWIDTH OUT Counters */
- INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x30"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x31"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x32"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x33"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port4, "event=0xff,umask=0x34"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port4.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port4.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port5, "event=0xff,umask=0x35"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port5.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port5.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port6, "event=0xff,umask=0x36"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port6.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port6.unit, "MiB"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port7, "event=0xff,umask=0x37"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port7.scale, "3.814697266e-6"),
- INTEL_UNCORE_EVENT_DESC(bw_out_port7.unit, "MiB"),
- { /* end: all zeroes */ },
-};
-
static struct intel_uncore_type spr_uncore_iio_free_running = {
.name = "iio_free_running",
.num_counters = 17,
.num_freerunning_types = SPR_IIO_FREERUNNING_TYPE_MAX,
.freerunning = spr_iio_freerunning,
.ops = &skx_uncore_iio_freerunning_ops,
- .event_descs = spr_uncore_iio_freerunning_events,
+ .event_descs = snr_uncore_iio_freerunning_events,
.format_group = &skx_uncore_iio_freerunning_format_group,
};
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ac1182141bf6..76b6d8469dba 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -110,9 +110,16 @@ static inline bool is_topdown_event(struct perf_event *event)
return is_metric_event(event) || is_slots_event(event);
}
+int is_x86_event(struct perf_event *event);
+
+static inline bool check_leader_group(struct perf_event *leader, int flags)
+{
+ return is_x86_event(leader) ? !!(leader->hw.flags & flags) : false;
+}
+
static inline bool is_branch_counters_group(struct perf_event *event)
{
- return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS;
+ return check_leader_group(event->group_leader, PERF_X86_EVENT_BRANCH_COUNTERS);
}
struct amd_nb {
@@ -1092,6 +1099,7 @@ extern struct x86_pmu x86_pmu __read_mostly;
DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period);
DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update);
+DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
{
@@ -1626,7 +1634,7 @@ void intel_pmu_pebs_disable_all(void);
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
-void intel_pmu_auto_reload_read(struct perf_event *event);
+void intel_pmu_drain_pebs_buffer(void);
void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index a481a939862e..fc06b216aacd 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -846,6 +846,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl),
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl),
X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &model_skl),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl),
{},
};
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 95eada2994e1..239f612816e6 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -730,3 +730,36 @@ bool hv_is_hyperv_initialized(void)
return hypercall_msr.enable;
}
EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
+
+int hv_apicid_to_vp_index(u32 apic_id)
+{
+ u64 control;
+ u64 status;
+ unsigned long irq_flags;
+ struct hv_get_vp_from_apic_id_in *input;
+ u32 *output, ret;
+
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->apic_ids[0] = apic_id;
+
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
+ status = hv_do_hypercall(control, input, output);
+ ret = output[0];
+
+ local_irq_restore(irq_flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("failed to get vp index from apic id %d, status %#llx\n",
+ apic_id, status);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_apicid_to_vp_index);
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 04775346369c..2510e91b29b0 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -30,6 +30,7 @@ void __init hv_vtl_init_platform(void)
x86_platform.realmode_init = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.timers.timer_init = x86_init_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_mptable = x86_init_noop;
@@ -174,41 +175,9 @@ free_lock:
return ret;
}
-static int hv_vtl_apicid_to_vp_id(u32 apic_id)
-{
- u64 control;
- u64 status;
- unsigned long irq_flags;
- struct hv_get_vp_from_apic_id_in *input;
- u32 *output, ret;
-
- local_irq_save(irq_flags);
-
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
- memset(input, 0, sizeof(*input));
- input->partition_id = HV_PARTITION_ID_SELF;
- input->apic_ids[0] = apic_id;
-
- output = (u32 *)input;
-
- control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
- status = hv_do_hypercall(control, input, output);
- ret = output[0];
-
- local_irq_restore(irq_flags);
-
- if (!hv_result_success(status)) {
- pr_err("failed to get vp id from apic id %d, status %#llx\n",
- apic_id, status);
- return -EINVAL;
- }
-
- return ret;
-}
-
static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
{
- int vp_id, cpu;
+ int vp_index, cpu;
/* Find the logical CPU for the APIC ID */
for_each_present_cpu(cpu) {
@@ -219,18 +188,18 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
return -EINVAL;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
- vp_id = hv_vtl_apicid_to_vp_id(apicid);
+ vp_index = hv_apicid_to_vp_index(apicid);
- if (vp_id < 0) {
+ if (vp_index < 0) {
pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
return -EINVAL;
}
- if (vp_id > ms_hyperv.max_vp_index) {
- pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
+ if (vp_index > ms_hyperv.max_vp_index) {
+ pr_err("Invalid CPU id %d for APIC ID %d\n", vp_index, apicid);
return -EINVAL;
}
- return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip);
+ return hv_vtl_bringup_vcpu(vp_index, cpu, start_eip);
}
int __init hv_vtl_early_init(void)
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 3215a4a07408..939b7081c5ab 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -192,7 +192,6 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
struct pci_dev *dev;
struct hv_interrupt_entry out_entry, *stored_entry;
struct irq_cfg *cfg = irqd_cfg(data);
- const cpumask_t *affinity;
int cpu;
u64 status;
@@ -204,8 +203,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
- affinity = irq_data_get_effective_affinity_mask(data);
- cpu = cpumask_first_and(affinity, cpu_online_mask);
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
if (data->chip_data) {
/*
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 60fc3ed72830..af87f440bc2a 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -10,6 +10,7 @@
#include <linux/hyperv.h>
#include <linux/types.h>
#include <linux/slab.h>
+#include <linux/cpu.h>
#include <asm/svm.h>
#include <asm/sev.h>
#include <asm/io.h>
@@ -289,7 +290,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
free_page((unsigned long)vmsa);
}
-int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip)
{
struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
__get_free_page(GFP_KERNEL | __GFP_ZERO);
@@ -298,10 +299,27 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
u64 ret, retry = 5;
struct hv_enable_vp_vtl *start_vp_input;
unsigned long flags;
+ int cpu, vp_index;
if (!vmsa)
return -ENOMEM;
+ /* Find the Hyper-V VP index which might be not the same as APIC ID */
+ vp_index = hv_apicid_to_vp_index(apic_id);
+ if (vp_index < 0 || vp_index > ms_hyperv.max_vp_index)
+ return -EINVAL;
+
+ /*
+ * Find the Linux CPU number for addressing the per-CPU data, and it
+ * might not be the same as APIC ID.
+ */
+ for_each_present_cpu(cpu) {
+ if (arch_match_cpu_phys_id(cpu, apic_id))
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+
native_store_gdt(&gdtr);
vmsa->gdtr.base = gdtr.address;
@@ -339,7 +357,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
vmsa->sev_features = sev_status >> 2;
ret = snp_set_vmsa(vmsa, true);
- if (!ret) {
+ if (ret) {
pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
free_page((u64)vmsa);
return ret;
@@ -349,7 +367,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg;
memset(start_vp_input, 0, sizeof(*start_vp_input));
start_vp_input->partition_id = -1;
- start_vp_input->vp_index = cpu;
+ start_vp_input->vp_index = vp_index;
start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
*(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
@@ -465,7 +483,6 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
enum hv_mem_host_visibility visibility)
{
struct hv_gpa_range_for_visibility *input;
- u16 pages_processed;
u64 hv_status;
unsigned long flags;
@@ -494,7 +511,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
memcpy((void *)input->gpa_page_list, pfn, count * sizeof(*pfn));
hv_status = hv_do_rep_hypercall(
HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count,
- 0, input, &pages_processed);
+ 0, input, NULL);
local_irq_restore(flags);
if (hv_result_success(hv_status))
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index ca9ae606aab9..f8dab517de8a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/stringify.h>
#include <asm/asm.h>
+#include <asm/bug.h>
#define ALT_FLAGS_SHIFT 16
@@ -134,6 +135,37 @@ static __always_inline int x86_call_depth_emit_accounting(u8 **pprog,
}
#endif
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_init_mod(struct module *mod);
+extern void its_fini_mod(struct module *mod);
+extern void its_free_mod(struct module *mod);
+extern u8 *its_static_thunk(int reg);
+#else /* CONFIG_MITIGATION_ITS */
+static inline void its_init_mod(struct module *mod) { }
+static inline void its_fini_mod(struct module *mod) { }
+static inline void its_free_mod(struct module *mod) { }
+static inline u8 *its_static_thunk(int reg)
+{
+ WARN_ONCE(1, "ITS not compiled in");
+
+ return NULL;
+}
+#endif
+
+#if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL)
+extern bool cpu_wants_rethunk(void);
+extern bool cpu_wants_rethunk_at(void *addr);
+#else
+static __always_inline bool cpu_wants_rethunk(void)
+{
+ return false;
+}
+static __always_inline bool cpu_wants_rethunk_at(void *addr)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SMP
extern void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 806649c7f23d..9a0f29be1a9e 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -22,8 +22,9 @@
#define SECOND_BYTE_OPCODE_UD2 0x0b
#define BUG_NONE 0xffff
-#define BUG_UD1 0xfffe
-#define BUG_UD2 0xfffd
+#define BUG_UD2 0xfffe
+#define BUG_UD1 0xfffd
+#define BUG_UD1_UBSAN 0xfffc
#ifdef CONFIG_GENERIC_BUG
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 31d19c815f99..7dd5ab239c87 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -126,6 +126,17 @@ static inline int cfi_get_offset(void)
extern u32 cfi_get_func_hash(void *func);
+#ifdef CONFIG_FINEIBT
+extern bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type);
+#else
+static inline bool
+decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ return false;
+}
+
+#endif
+
#else
static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index aa30fd8cad7f..b6099456477c 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -69,4 +69,16 @@ int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
extern struct cpumask cpus_stop_mask;
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 913fd3a7bac6..ef5749a0d8c2 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -449,11 +449,13 @@
#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
+#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */
#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */
#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
@@ -474,6 +476,11 @@
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */
+#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32 + 6) /* Use thunk for indirect branches in lower half of cacheline */
+
+#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
+#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */
/*
* BUG word(s)
@@ -525,4 +532,7 @@
#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */
#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */
#define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */
+#define X86_BUG_ITS X86_BUG(1*32 + 5) /* "its" CPU is affected by Indirect Target Selection */
+#define X86_BUG_ITS_NATIVE_ONLY X86_BUG(1*32 + 6) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
+#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index fdbbbfec745a..820b4aeabd0c 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -9,6 +9,14 @@
#include <asm/cpufeature.h>
#include <asm/msr.h>
+/*
+ * Define bits that are always set to 1 in DR7, only bit 10 is
+ * architecturally reserved to '1'.
+ *
+ * This is also the init/reset value for DR7.
+ */
+#define DR7_FIXED_1 0x00000400
+
DECLARE_PER_CPU(unsigned long, cpu_dr7);
#ifndef CONFIG_PARAVIRT_XXL
@@ -100,8 +108,8 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value)
static inline void hw_breakpoint_disable(void)
{
- /* Zero the control register for HW Breakpoint */
- set_debugreg(0UL, 7);
+ /* Reset the control register for HW Breakpoint */
+ set_debugreg(DR7_FIXED_1, 7);
/* Zero-out the individual HW breakpoint address registers */
set_debugreg(0UL, 0);
@@ -125,9 +133,12 @@ static __always_inline unsigned long local_db_save(void)
return 0;
get_debugreg(dr7, 7);
- dr7 &= ~0x400; /* architecturally set bit */
+
+ /* Architecturally set bit */
+ dr7 &= ~DR7_FIXED_1;
if (dr7)
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
+
/*
* Ensure the compiler doesn't lower the above statements into
* the critical section; disabling breakpoints late would not
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
index 1e59581d500c..b778ae6e67ee 100644
--- a/arch/x86/include/asm/ibt.h
+++ b/arch/x86/include/asm/ibt.h
@@ -41,7 +41,7 @@
_ASM_PTR fname "\n\t" \
".popsection\n\t"
-static inline __attribute_const__ u32 gen_endbr(void)
+static __always_inline __attribute_const__ u32 gen_endbr(void)
{
u32 endbr;
@@ -56,7 +56,7 @@ static inline __attribute_const__ u32 gen_endbr(void)
return endbr;
}
-static inline __attribute_const__ u32 gen_endbr_poison(void)
+static __always_inline __attribute_const__ u32 gen_endbr_poison(void)
{
/*
* 4 byte NOP that isn't NOP4 (in fact it is OSP NOP3), such that it
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 1a42f829667a..c6198fbcc1d7 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -46,6 +46,7 @@
#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY)
#define INTEL_PENTIUM_PRO IFM(6, 0x01)
+#define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05)
#define INTEL_CORE_YONAH IFM(6, 0x0E)
@@ -115,6 +116,8 @@
#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD)
#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
+#define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */
+
/* "Hybrid" Processors (P-Core/E-Core) */
#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index cf7fc2b8e3ce..2b75fe80fcb2 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -44,13 +44,13 @@ static __always_inline void native_irq_enable(void)
static __always_inline void native_safe_halt(void)
{
- mds_idle_clear_cpu_buffers();
+ x86_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
static __always_inline void native_halt(void)
{
- mds_idle_clear_cpu_buffers();
+ x86_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
}
@@ -76,6 +76,28 @@ static __always_inline void native_local_irq_restore(unsigned long flags)
#endif
+#ifndef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static __always_inline void arch_safe_halt(void)
+{
+ native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static __always_inline void halt(void)
+{
+ native_halt();
+}
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
@@ -98,24 +120,6 @@ static __always_inline void arch_local_irq_enable(void)
}
/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static __always_inline void arch_safe_halt(void)
-{
- native_safe_halt();
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static __always_inline void halt(void)
-{
- native_halt();
-}
-
-/*
* For spinlocks, etc:
*/
static __always_inline unsigned long arch_local_irq_save(void)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8499b9cb9c82..0caa3293f6db 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -31,6 +31,7 @@
#include <asm/apic.h>
#include <asm/pvclock-abi.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/msr-index.h>
@@ -246,7 +247,6 @@ enum x86_intercept_stage;
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
#define DR7_GD (1 << 13)
-#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
#define KVM_GUESTDBG_VALID_MASK \
@@ -761,6 +761,7 @@ struct kvm_vcpu_arch {
u32 pkru;
u32 hflags;
u64 efer;
+ u64 host_debugctl;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
bool load_eoi_exitmap_pending;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 695e569159c1..be7cddc414e4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -17,10 +17,12 @@ struct ucode_cpu_info {
void load_ucode_bsp(void);
void load_ucode_ap(void);
void microcode_bsp_resume(void);
+bool __init microcode_loader_disabled(void);
#else
static inline void load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void microcode_bsp_resume(void) { }
+static inline bool __init microcode_loader_disabled(void) { return false; }
#endif
extern unsigned long initrd_start_early;
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 5f0bc6a6d025..a42439c2ed24 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -275,11 +275,11 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#ifdef CONFIG_AMD_MEM_ENCRYPT
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-int hv_snp_boot_ap(u32 cpu, unsigned long start_ip);
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip);
#else
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; }
+static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip) { return 0; }
#endif
#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
@@ -313,6 +313,7 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg)
{
return __rdmsr(reg);
}
+int hv_apicid_to_vp_index(u32 apic_id);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
@@ -334,6 +335,7 @@ static inline void hv_set_msr(unsigned int reg, u64 value) { }
static inline u64 hv_get_msr(unsigned int reg) { return 0; }
static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { }
static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
+static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
#endif /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 61e991507353..7ebe76f69417 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -209,6 +209,14 @@
* VERW clears CPU Register
* File.
*/
+#define ARCH_CAP_ITS_NO BIT_ULL(62) /*
+ * Not susceptible to
+ * Indirect Target Selection.
+ * This bit is not set by
+ * HW, but is synthesized by
+ * VMMs for guests to know
+ * their affected status.
+ */
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
@@ -613,6 +621,7 @@
#define MSR_AMD64_OSVW_STATUS 0xc0010141
#define MSR_AMD_PPIN_CTL 0xc00102f0
#define MSR_AMD_PPIN 0xc00102f1
+#define MSR_AMD64_CPUID_FN_7 0xc0011002
#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 920426d691ce..7f9a97c572fe 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -44,8 +44,6 @@ static __always_inline void __monitorx(const void *eax, unsigned long ecx,
static __always_inline void __mwait(unsigned long eax, unsigned long ecx)
{
- mds_idle_clear_cpu_buffers();
-
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
@@ -80,7 +78,7 @@ static __always_inline void __mwait(unsigned long eax, unsigned long ecx)
static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx,
unsigned long ecx)
{
- /* No MDS buffer clear as this is AMD/HYGON only */
+ /* No need for TSA buffer clearing on AMD */
/* "mwaitx %eax, %ebx, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xfb;"
@@ -98,7 +96,7 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx,
*/
static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
- mds_idle_clear_cpu_buffers();
+
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
@@ -116,24 +114,29 @@ static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
*/
static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
+ if (need_resched())
+ return;
+
+ x86_idle_clear_cpu_buffers();
+
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
- if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
- mb();
- clflush((void *)&current_thread_info()->flags);
- mb();
- }
+ const void *addr = &current_thread_info()->flags;
+
+ alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
+ __monitor(addr, 0, 0);
- __monitor((void *)&current_thread_info()->flags, 0, 0);
+ if (need_resched())
+ goto out;
- if (!need_resched()) {
- if (ecx & 1) {
- __mwait(eax, ecx);
- } else {
- __sti_mwait(eax, ecx);
- raw_local_irq_disable();
- }
+ if (ecx & 1) {
+ __mwait(eax, ecx);
+ } else {
+ __sti_mwait(eax, ecx);
+ raw_local_irq_disable();
}
}
+
+out:
current_clr_polling();
}
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 41a0ebb699ec..f677382093f3 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -56,6 +56,8 @@ int __register_nmi_handler(unsigned int, struct nmiaction *);
void unregister_nmi_handler(unsigned int, const char *);
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler);
+
void stop_nmi(void);
void restart_nmi(void);
void local_touch_nmi(void);
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 96b410b1d4e8..331f6a05535d 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -210,9 +210,8 @@
.endm
/*
- * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
- * to the retpoline thunk with a CS prefix when the register requires
- * a RAX prefix byte to encode. Also see apply_retpolines().
+ * Emits a conditional CS prefix that is compatible with
+ * -mindirect-branch-cs-prefix.
*/
.macro __CS_PREFIX reg:req
.irp rs,r8,r9,r10,r11,r12,r13,r14,r15
@@ -316,25 +315,31 @@
.endm
/*
- * Macro to execute VERW instruction that mitigate transient data sampling
- * attacks such as MDS. On affected systems a microcode update overloaded VERW
- * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF.
- *
+ * Macro to execute VERW insns that mitigate transient data sampling
+ * attacks such as MDS or TSA. On affected systems a microcode update
+ * overloaded VERW insns to also clear the CPU buffers. VERW clobbers
+ * CFLAGS.ZF.
* Note: Only the memory operand variant of VERW clears the CPU buffers.
*/
-.macro CLEAR_CPU_BUFFERS
+.macro __CLEAR_CPU_BUFFERS feature
#ifdef CONFIG_X86_64
- ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF
+ ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature
#else
/*
* In 32bit mode, the memory operand must be a %cs reference. The data
* segments may not be usable (vm86 mode), and the stack segment may not
* be flat (ESPFIX32).
*/
- ALTERNATIVE "", "verw %cs:mds_verw_sel", X86_FEATURE_CLEAR_CPU_BUF
+ ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature
#endif
.endm
+#define CLEAR_CPU_BUFFERS \
+ __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF
+
+#define VM_CLEAR_CPU_BUFFERS \
+ __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM
+
#ifdef CONFIG_X86_64
.macro CLEAR_BRANCH_HISTORY
ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
@@ -356,10 +361,14 @@
".long 999b\n\t" \
".popsection\n\t"
+#define ITS_THUNK_SIZE 64
+
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
+typedef u8 its_thunk_t[ITS_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
+extern its_thunk_t __x86_indirect_its_thunk_array[];
#ifdef CONFIG_MITIGATION_RETHUNK
extern void __x86_return_thunk(void);
@@ -383,6 +392,12 @@ static inline void srso_return_thunk(void) {}
static inline void srso_alias_return_thunk(void) {}
#endif
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_return_thunk(void);
+#else
+static inline void its_return_thunk(void) {}
+#endif
+
extern void retbleed_return_thunk(void);
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
@@ -439,19 +454,22 @@ static inline void call_depth_return_thunk(void) {}
#ifdef CONFIG_X86_64
/*
+ * Emits a conditional CS prefix that is compatible with
+ * -mindirect-branch-cs-prefix.
+ */
+#define __CS_PREFIX(reg) \
+ ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\rs," reg "\n" \
+ ".byte 0x2e\n" \
+ ".endif\n" \
+ ".endr\n"
+
+/*
* Inline asm uses the %V modifier which is only in newer GCC
* which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
*/
-# define CALL_NOSPEC \
- ALTERNATIVE_2( \
- ANNOTATE_RETPOLINE_SAFE \
- "call *%[thunk_target]\n", \
- "call __x86_indirect_thunk_%V[thunk_target]\n", \
- X86_FEATURE_RETPOLINE, \
- "lfence;\n" \
- ANNOTATE_RETPOLINE_SAFE \
- "call *%[thunk_target]\n", \
- X86_FEATURE_RETPOLINE_LFENCE)
+#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \
+ "call __x86_indirect_thunk_%V[thunk_target]\n"
# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
@@ -570,24 +588,24 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
-extern u16 mds_verw_sel;
+extern u16 x86_verw_sel;
#include <asm/segment.h>
/**
- * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+ * x86_clear_cpu_buffers - Buffer clearing support for different x86 CPU vulns
*
* This uses the otherwise unused and obsolete VERW instruction in
* combination with microcode which triggers a CPU buffer flush when the
* instruction is executed.
*/
-static __always_inline void mds_clear_cpu_buffers(void)
+static __always_inline void x86_clear_cpu_buffers(void)
{
static const u16 ds = __KERNEL_DS;
@@ -604,14 +622,15 @@ static __always_inline void mds_clear_cpu_buffers(void)
}
/**
- * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * x86_idle_clear_cpu_buffers - Buffer clearing support in idle for the MDS
+ * and TSA vulnerabilities.
*
* Clear CPU buffers if the corresponding static key is enabled
*/
-static __always_inline void mds_idle_clear_cpu_buffers(void)
+static __always_inline void x86_idle_clear_cpu_buffers(void)
{
- if (static_branch_likely(&mds_idle_clear))
- mds_clear_cpu_buffers();
+ if (static_branch_likely(&cpu_buf_idle_clear))
+ x86_clear_cpu_buffers();
}
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d4eb9e1d61b8..75d4c994f5e2 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -107,6 +107,16 @@ static inline void notify_page_enc_status_changed(unsigned long pfn,
PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc);
}
+static __always_inline void arch_safe_halt(void)
+{
+ PVOP_VCALL0(irq.safe_halt);
+}
+
+static inline void halt(void)
+{
+ PVOP_VCALL0(irq.halt);
+}
+
#ifdef CONFIG_PARAVIRT_XXL
static inline void load_sp0(unsigned long sp0)
{
@@ -170,16 +180,6 @@ static inline void __write_cr4(unsigned long x)
PVOP_VCALL1(cpu.write_cr4, x);
}
-static __always_inline void arch_safe_halt(void)
-{
- PVOP_VCALL0(irq.safe_halt);
-}
-
-static inline void halt(void)
-{
- PVOP_VCALL0(irq.halt);
-}
-
extern noinstr void pv_native_wbinvd(void);
static __always_inline void wbinvd(void)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8d4fbe1be489..9334fdd1f635 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -122,10 +122,9 @@ struct pv_irq_ops {
struct paravirt_callee_save save_fl;
struct paravirt_callee_save irq_disable;
struct paravirt_callee_save irq_enable;
-
+#endif
void (*safe_halt)(void);
void (*halt)(void);
-#endif
} __no_randomize_layout;
struct pv_mmu_ops {
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index c55a79d5feae..2d9c250b3c8d 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -349,9 +349,9 @@ do { \
\
asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
- : [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
+ : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), \
+ "+d" (old__.high)) \
: "b" (new__.low), \
"c" (new__.high), \
"S" (&(_var)) \
@@ -380,10 +380,10 @@ do { \
asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
CC_SET(z) \
- : CC_OUT(z) (success), \
- [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
+ : ALT_OUTPUT_SP(CC_OUT(z) (success), \
+ [var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), \
+ "+d" (old__.high)) \
: "b" (new__.low), \
"c" (new__.high), \
"S" (&(_var)) \
@@ -420,9 +420,9 @@ do { \
\
asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
- : [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
+ : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), \
+ "+d" (old__.high)) \
: "b" (new__.low), \
"c" (new__.high), \
"S" (&(_var)) \
@@ -451,10 +451,10 @@ do { \
asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
CC_SET(z) \
- : CC_OUT(z) (success), \
- [var] "+m" (__my_cpu_var(_var)), \
- "+a" (old__.low), \
- "+d" (old__.high) \
+ : ALT_OUTPUT_SP(CC_OUT(z) (success), \
+ [var] "+m" (__my_cpu_var(_var)), \
+ "+a" (old__.low), \
+ "+d" (old__.high)) \
: "b" (new__.low), \
"c" (new__.high), \
"S" (&(_var)) \
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 7505bb5d260a..aa351c4a20ee 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -520,6 +520,7 @@ struct pebs_xmm {
*/
#define IBS_OP_CUR_CNT (0xFFF80ULL<<32)
#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32)
+#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52)
#define IBS_OP_CNT_CTL (1ULL<<19)
#define IBS_OP_VAL (1ULL<<18)
#define IBS_OP_ENABLE (1ULL<<17)
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 98726c2b04f8..ddaf3c62efb5 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -116,7 +116,7 @@ enum psc_op {
#define GHCB_MSR_VMPL_REQ 0x016
#define GHCB_MSR_VMPL_REQ_LEVEL(v) \
/* GHCBData[39:32] */ \
- (((u64)(v) & GENMASK_ULL(7, 0) << 32) | \
+ ((((u64)(v) & GENMASK_ULL(7, 0)) << 32) | \
/* GHCBDdata[11:0] */ \
GHCB_MSR_VMPL_REQ)
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index e770c4fc47f4..8727c7e21dd1 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -24,4 +24,26 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+/*
+ * To prevent immediate repeat of single step trap on return from SIGTRAP
+ * handler if the trap flag (TF) is set without an external debugger attached,
+ * clear the software event flag in the augmented SS, ensuring no single-step
+ * trap is pending upon ERETU completion.
+ *
+ * Note, this function should be called in sigreturn() before the original
+ * state is restored to make sure the TF is read from the entry frame.
+ */
+static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs)
+{
+ /*
+ * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction
+ * is being single-stepped, do not clear the software event flag in the
+ * augmented SS, thus a debugger won't skip over the following instruction.
+ */
+#ifdef CONFIG_X86_FRED
+ if (!(regs->flags & X86_EFLAGS_TF))
+ regs->fred_ss.swevent = 0;
+#endif
+}
+
#endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index eba178996d84..2d13ef1f4b05 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve);
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
-void tdx_safe_halt(void);
+void tdx_halt(void);
bool tdx_early_handle_ve(struct pt_regs *regs);
@@ -69,7 +69,7 @@ u64 tdx_hcall_get_quote(u8 *buf, size_t size);
#else
static inline void tdx_early_init(void) { };
-static inline void tdx_safe_halt(void) { };
+static inline void tdx_halt(void) { };
static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
@@ -97,7 +97,7 @@ void tdx_init(void);
typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
-static inline u64 sc_retry(sc_func_t func, u64 fn,
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
struct tdx_module_args *args)
{
int retry = RDRAND_RETRY_LOOPS;
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 02fc2aa06e9e..3da645139748 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -242,7 +242,7 @@ void flush_tlb_multi(const struct cpumask *cpumask,
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
- : PAGE_SHIFT, false)
+ : PAGE_SHIFT, true)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 9b82eebd7add..dafbf581c515 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -26,7 +26,7 @@
#define XLF_5LEVEL_ENABLED (1<<6)
#define XLF_MEM_ENCRYPTION (1<<7)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/screen_info.h>
@@ -210,6 +210,6 @@ enum x86_hardware_subarch {
X86_NR_SUBARCHS,
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h
index 0007ba077c0c..41da492dfb01 100644
--- a/arch/x86/include/uapi/asm/debugreg.h
+++ b/arch/x86/include/uapi/asm/debugreg.h
@@ -15,7 +15,26 @@
which debugging register was responsible for the trap. The other bits
are either reserved or not of interest to us. */
-/* Define reserved bits in DR6 which are always set to 1 */
+/*
+ * Define bits in DR6 which are set to 1 by default.
+ *
+ * This is also the DR6 architectural value following Power-up, Reset or INIT.
+ *
+ * Note, with the introduction of Bus Lock Detection (BLD) and Restricted
+ * Transactional Memory (RTM), the DR6 register has been modified:
+ *
+ * 1) BLD flag (bit 11) is no longer reserved to 1 if the CPU supports
+ * Bus Lock Detection. The assertion of a bus lock could clear it.
+ *
+ * 2) RTM flag (bit 16) is no longer reserved to 1 if the CPU supports
+ * restricted transactional memory. #DB occurred inside an RTM region
+ * could clear it.
+ *
+ * Apparently, DR6.BLD and DR6.RTM are active low bits.
+ *
+ * As a result, DR6_RESERVED is an incorrect name now, but it is kept for
+ * compatibility.
+ */
#define DR6_RESERVED (0xFFFF0FF0)
#define DR_TRAP0 (0x1) /* db0 */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 2f491efe3a12..55bc66867156 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -54,7 +54,7 @@
*/
#define E820_RESERVED_KERN 128
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
struct e820entry {
__u64 addr; /* start of memory segment */
@@ -76,7 +76,7 @@ struct e820map {
#define BIOS_ROM_BASE 0xffe00000
#define BIOS_ROM_END 0xffffffff
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_E820_H */
diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h
index d62ac5db093b..a82c039d8e6a 100644
--- a/arch/x86/include/uapi/asm/ldt.h
+++ b/arch/x86/include/uapi/asm/ldt.h
@@ -12,7 +12,7 @@
/* The size of each LDT entry. */
#define LDT_ENTRY_SIZE 8
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
* not to the default values if you still want to do syscalls. This
@@ -44,5 +44,5 @@ struct user_desc {
#define MODIFY_LDT_CONTENTS_STACK 1
#define MODIFY_LDT_CONTENTS_CODE 2
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h
index e7516b402a00..4b8917ca28fe 100644
--- a/arch/x86/include/uapi/asm/msr.h
+++ b/arch/x86/include/uapi/asm/msr.h
@@ -2,7 +2,7 @@
#ifndef _UAPI_ASM_X86_MSR_H
#define _UAPI_ASM_X86_MSR_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/ioctl.h>
@@ -10,5 +10,5 @@
#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8])
#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8])
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_MSR_H */
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 16074b9c93bb..5823584dea13 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,7 +25,7 @@
#else /* __i386__ */
-#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+#if defined(__ASSEMBLER__) || defined(__FRAME_OFFSETS)
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
@@ -57,7 +57,7 @@
#define EFLAGS 144
#define RSP 152
#define SS 160
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/* top of stack page */
#define FRAME_SIZE 168
@@ -87,7 +87,7 @@
#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#endif
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index 85165c0edafc..e0b5b4f6226b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -7,7 +7,7 @@
#include <asm/processor-flags.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef __i386__
/* this struct defines the way the registers are stored on the
@@ -81,6 +81,6 @@ struct pt_regs {
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
index b111b0c18544..50c45ead4e7c 100644
--- a/arch/x86/include/uapi/asm/setup_data.h
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -18,7 +18,7 @@
#define SETUP_INDIRECT (1<<31)
#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
@@ -78,6 +78,6 @@ struct ima_setup_data {
__u64 size;
} __attribute__((packed));
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
index f777346450ec..1067efabf18b 100644
--- a/arch/x86/include/uapi/asm/signal.h
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -2,7 +2,7 @@
#ifndef _UAPI_ASM_X86_SIGNAL_H
#define _UAPI_ASM_X86_SIGNAL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/compiler.h>
@@ -16,7 +16,7 @@ struct siginfo;
typedef unsigned long sigset_t;
#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define SIGHUP 1
@@ -68,7 +68,7 @@ typedef unsigned long sigset_t;
#include <asm-generic/signal-defs.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
# ifndef __KERNEL__
@@ -106,6 +106,6 @@ typedef struct sigaltstack {
__kernel_size_t ss_size;
} stack_t;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SIGNAL_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c70b86f1f295..63adda8a143f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -23,6 +23,8 @@
#include <linux/serial_core.h>
#include <linux/pgtable.h>
+#include <xen/xen.h>
+
#include <asm/e820/api.h>
#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
@@ -1730,6 +1732,15 @@ int __init acpi_mps_check(void)
{
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
/* mptable code is not built-in*/
+
+ /*
+ * Xen disables ACPI in PV DomU guests but it still emulates APIC and
+ * supports SMP. Returning early here ensures that APIC is not disabled
+ * unnecessarily and the guest is not limited to a single vCPU.
+ */
+ if (xen_pv_domain() && !xen_initial_domain())
+ return 0;
+
if (acpi_disabled || acpi_noirq) {
pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n");
return 1;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d17518ca19b8..6ab96bc764cf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -18,6 +18,7 @@
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
+#include <linux/execmem.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@@ -31,6 +32,8 @@
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
#include <asm/cfi.h>
+#include <asm/ibt.h>
+#include <asm/set_memory.h>
int __read_mostly alternatives_patched;
@@ -124,6 +127,136 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
#endif
};
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
+#endif
+static void *its_page;
+static unsigned int its_offset;
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
+{
+ u8 *bytes = thunk;
+ int i = 0;
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+ bytes[i++] = 0xff;
+ bytes[i++] = 0xe0 + reg; /* jmp *reg */
+ bytes[i++] = 0xcc;
+
+ return thunk;
+}
+
+#ifdef CONFIG_MODULES
+void its_init_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ mutex_lock(&text_mutex);
+ its_mod = mod;
+ its_page = NULL;
+}
+
+void its_fini_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ WARN_ON_ONCE(its_mod != mod);
+
+ its_mod = NULL;
+ its_page = NULL;
+ mutex_unlock(&text_mutex);
+
+ for (int i = 0; i < mod->its_num_pages; i++) {
+ void *page = mod->its_page_array[i];
+ set_memory_rox((unsigned long)page, 1);
+ }
+}
+
+void its_free_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ for (int i = 0; i < mod->its_num_pages; i++) {
+ void *page = mod->its_page_array[i];
+ execmem_free(page);
+ }
+ kfree(mod->its_page_array);
+}
+#endif /* CONFIG_MODULES */
+
+static void *its_alloc(void)
+{
+ void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+
+ if (!page)
+ return NULL;
+
+#ifdef CONFIG_MODULES
+ if (its_mod) {
+ void *tmp = krealloc(its_mod->its_page_array,
+ (its_mod->its_num_pages+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ its_mod->its_page_array = tmp;
+ its_mod->its_page_array[its_mod->its_num_pages++] = page;
+ }
+#endif /* CONFIG_MODULES */
+
+ return no_free_ptr(page);
+}
+
+static void *its_allocate_thunk(int reg)
+{
+ int size = 3 + (reg / 8);
+ void *thunk;
+
+ if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+ its_page = its_alloc();
+ if (!its_page) {
+ pr_err("ITS page allocation failed\n");
+ return NULL;
+ }
+ memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+ its_offset = 32;
+ }
+
+ /*
+ * If the indirect branch instruction will be in the lower half
+ * of a cacheline, then update the offset to reach the upper half.
+ */
+ if ((its_offset + size - 1) % 64 < 32)
+ its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+ thunk = its_page + its_offset;
+ its_offset += size;
+
+ set_memory_rw((unsigned long)its_page, 1);
+ thunk = its_init_thunk(thunk, reg);
+ set_memory_rox((unsigned long)its_page, 1);
+
+ return thunk;
+}
+
+u8 *its_static_thunk(int reg)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+ return thunk;
+}
+
+#endif
+
/*
* Nomenclature for variable names to simplify and clarify this code and ease
* any potential staring at it:
@@ -581,7 +714,8 @@ static int emit_indirect(int op, int reg, u8 *bytes)
return i;
}
-static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+ void *call_dest, void *jmp_dest)
{
u8 op = insn->opcode.bytes[0];
int i = 0;
@@ -602,7 +736,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
switch (op) {
case CALL_INSN_OPCODE:
__text_gen_insn(bytes+i, op, addr+i,
- __x86_indirect_call_thunk_array[reg],
+ call_dest,
CALL_INSN_SIZE);
i += CALL_INSN_SIZE;
break;
@@ -610,7 +744,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8
case JMP32_INSN_OPCODE:
clang_jcc:
__text_gen_insn(bytes+i, op, addr+i,
- __x86_indirect_jump_thunk_array[reg],
+ jmp_dest,
JMP32_INSN_SIZE);
i += JMP32_INSN_SIZE;
break;
@@ -625,6 +759,39 @@ clang_jcc:
return i;
}
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ return __emit_trampoline(addr, insn, bytes,
+ __x86_indirect_call_thunk_array[reg],
+ __x86_indirect_jump_thunk_array[reg]);
+}
+
+#ifdef CONFIG_MITIGATION_ITS
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+ u8 *tmp = its_allocate_thunk(reg);
+
+ if (tmp)
+ thunk = tmp;
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return false;
+
+ /* Indirect branch opcode is 2 or 3 bytes depending on reg */
+ addr += 1 + reg / 8;
+
+ /* Lower-half of the cacheline? */
+ return !(addr & 0x20);
+}
+#endif
+
/*
* Rewrite the compiler generated retpoline thunk calls.
*
@@ -699,6 +866,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
bytes[i++] = 0xe8; /* LFENCE */
}
+#ifdef CONFIG_MITIGATION_ITS
+ /*
+ * Check if the address of last byte of emitted-indirect is in
+ * lower-half of the cacheline. Such branches need ITS mitigation.
+ */
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+ return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
ret = emit_indirect(op, reg, bytes + i);
if (ret < 0)
return ret;
@@ -770,6 +946,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
#ifdef CONFIG_MITIGATION_RETHUNK
+bool cpu_wants_rethunk(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
+
+bool cpu_wants_rethunk_at(void *addr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ return false;
+ if (x86_return_thunk != its_return_thunk)
+ return true;
+
+ return !((unsigned long)addr & 0x20);
+}
+
/*
* Rewrite the compiler generated return thunk tail-calls.
*
@@ -786,7 +977,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
int i = 0;
/* Patch the custom return thunks... */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+ if (cpu_wants_rethunk_at(addr)) {
i = JMP32_INSN_SIZE;
__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
} else {
@@ -803,7 +994,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk())
static_call_force_reinit();
for (s = start; s < end; s++) {
@@ -1063,6 +1254,7 @@ asm( ".pushsection .rodata \n"
" endbr64 \n"
" subl $0x12345678, %r10d \n"
" je fineibt_preamble_end \n"
+ "fineibt_preamble_ud2: \n"
" ud2 \n"
" nop \n"
"fineibt_preamble_end: \n"
@@ -1070,9 +1262,11 @@ asm( ".pushsection .rodata \n"
);
extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_ud2[];
extern u8 fineibt_preamble_end[];
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_ud2 (fineibt_preamble_ud2 - fineibt_preamble_start)
#define fineibt_preamble_hash 7
asm( ".pushsection .rodata \n"
@@ -1377,6 +1571,33 @@ static void poison_cfi(void *addr)
}
}
+/*
+ * regs->ip points to a UD2 instruction, return true and fill out target and
+ * type when this UD2 is from a FineIBT preamble.
+ *
+ * We check the preamble by checking for the ENDBR instruction relative to the
+ * UD2 instruction.
+ */
+bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_preamble_ud2;
+ u32 endbr, hash;
+
+ __get_kernel_nofault(&endbr, addr, u32, Efault);
+ if (endbr != gen_endbr())
+ return false;
+
+ *target = addr + fineibt_preamble_size;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->r10 + hash;
+
+ return true;
+
+Efault:
+ return false;
+}
+
#else
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
@@ -1665,6 +1886,8 @@ static noinline void __init alt_reloc_selftest(void)
void __init alternative_instructions(void)
{
+ u64 ibt;
+
int3_selftest();
/*
@@ -1691,6 +1914,9 @@ void __init alternative_instructions(void)
*/
paravirt_set_cap();
+ /* Keep CET-IBT disabled until caller/callee are patched */
+ ibt = ibt_save(/*disable*/ true);
+
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
__cfi_sites, __cfi_sites_end, true);
@@ -1714,6 +1940,8 @@ void __init alternative_instructions(void)
*/
apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+ ibt_restore(ibt);
+
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 37b8244899d8..04712fd0c964 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -405,7 +405,6 @@ bool __init early_is_amd_nb(u32 device)
struct resource *amd_get_mmconfig_range(struct resource *res)
{
- u32 address;
u64 base, msr;
unsigned int segn_busn_bits;
@@ -413,13 +412,11 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return NULL;
- /* assume all cpus from fam10h have mmconfig */
- if (boot_cpu_data.x86 < 0x10)
+ /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */
+ if (boot_cpu_data.x86 < 0x10 ||
+ rdmsrl_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr))
return NULL;
- address = MSR_FAM10H_MMIO_CONF_BASE;
- rdmsrl(address, msr);
-
/* mmconfig is not enabled */
if (!(msr & FAM10H_MMIO_CONF_ENABLE))
return NULL;
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index e6bf78fac146..f6905bef0af8 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -70,11 +70,25 @@ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
unsigned long target;
u32 type;
- if (!is_cfi_trap(regs->ip))
- return BUG_TRAP_TYPE_NONE;
+ switch (cfi_mode) {
+ case CFI_KCFI:
+ if (!is_cfi_trap(regs->ip))
+ return BUG_TRAP_TYPE_NONE;
+
+ if (!decode_cfi_insn(regs, &target, &type))
+ return report_cfi_failure_noaddr(regs, regs->ip);
+
+ break;
- if (!decode_cfi_insn(regs, &target, &type))
- return report_cfi_failure_noaddr(regs, regs->ip);
+ case CFI_FINEIBT:
+ if (!decode_fineibt_insn(regs, &target, &type))
+ return BUG_TRAP_TYPE_NONE;
+
+ break;
+
+ default:
+ return BUG_TRAP_TYPE_NONE;
+ }
return report_cfi_failure(regs, regs->ip, &target, type);
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 79d2e17f6582..4810271302d0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -368,6 +368,66 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
#endif
}
+static bool amd_check_tsa_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ union zen_patch_rev p;
+ u32 min_rev = 0;
+
+ p.ext_fam = c->x86 - 0xf;
+ p.model = c->x86_model;
+ p.ext_model = c->x86_model >> 4;
+ p.stepping = c->x86_stepping;
+ /* reserved bits are expected to be 0 in test below */
+ p.__reserved = 0;
+
+ if (cpu_has(c, X86_FEATURE_ZEN3) ||
+ cpu_has(c, X86_FEATURE_ZEN4)) {
+ switch (p.ucode_rev >> 8) {
+ case 0xa0011: min_rev = 0x0a0011d7; break;
+ case 0xa0012: min_rev = 0x0a00123b; break;
+ case 0xa0082: min_rev = 0x0a00820d; break;
+ case 0xa1011: min_rev = 0x0a10114c; break;
+ case 0xa1012: min_rev = 0x0a10124c; break;
+ case 0xa1081: min_rev = 0x0a108109; break;
+ case 0xa2010: min_rev = 0x0a20102e; break;
+ case 0xa2012: min_rev = 0x0a201211; break;
+ case 0xa4041: min_rev = 0x0a404108; break;
+ case 0xa5000: min_rev = 0x0a500012; break;
+ case 0xa6012: min_rev = 0x0a60120a; break;
+ case 0xa7041: min_rev = 0x0a704108; break;
+ case 0xa7052: min_rev = 0x0a705208; break;
+ case 0xa7080: min_rev = 0x0a708008; break;
+ case 0xa70c0: min_rev = 0x0a70c008; break;
+ case 0xaa002: min_rev = 0x0aa00216; break;
+ default:
+ pr_debug("%s: ucode_rev: 0x%x, current revision: 0x%x\n",
+ __func__, p.ucode_rev, c->microcode);
+ return false;
+ }
+ }
+
+ if (!min_rev)
+ return false;
+
+ return c->microcode >= min_rev;
+}
+
+static void tsa_init(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_ZEN3) ||
+ cpu_has(c, X86_FEATURE_ZEN4)) {
+ if (amd_check_tsa_microcode())
+ setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO);
+ setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO);
+ }
+}
+
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -475,6 +535,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
bsp_determine_snp(c);
+
+ tsa_init(c);
+
return;
warn:
@@ -627,7 +690,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
* (model = 0x14) and later actually support it.
* (AMD Erratum #110, docId: 25759).
*/
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
if (!rdmsrl_amd_safe(0xc001100d, &value)) {
value &= ~BIT_64(32);
@@ -862,6 +925,16 @@ static void init_amd_zen1(struct cpuinfo_x86 *c)
pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
setup_force_cpu_bug(X86_BUG_DIV0);
+
+ /*
+ * Turn off the Instructions Retired free counter on machines that are
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (c->x86_model < 0x30) {
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+ clear_cpu_cap(c, X86_FEATURE_IRPERF);
+ }
}
static bool cpu_has_zenbleed_microcode(void)
@@ -906,6 +979,13 @@ static void init_amd_zen2(struct cpuinfo_x86 *c)
init_spectral_chicken(c);
fix_erratum_1386(c);
zen2_zenbleed_check(c);
+
+ /* Disable RDSEED on AMD Cyan Skillfish because of an error. */
+ if (c->x86_model == 0x47 && c->x86_stepping == 0x0) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg("RDSEED is not reliable on this platform; disabling.\n");
+ }
}
static void init_amd_zen3(struct cpuinfo_x86 *c)
@@ -1045,13 +1125,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
- /*
- * Turn on the Instructions Retired free counter on machines not
- * susceptible to erratum #1054 "Instructions Retired Performance
- * Counter May Be Inaccurate".
- */
- if (cpu_has(c, X86_FEATURE_IRPERF) &&
- (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f))
+ /* Enable the Instructions Retired free counter */
+ if (cpu_has(c, X86_FEATURE_IRPERF))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 5fba44a4f988..c2c7b76d953f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -49,6 +49,8 @@ static void __init srbds_select_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
static void __init srso_select_mitigation(void);
static void __init gds_select_mitigation(void);
+static void __init its_select_mitigation(void);
+static void __init tsa_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -67,6 +69,14 @@ static DEFINE_MUTEX(spec_ctrl_mutex);
void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
+static void __init set_return_thunk(void *thunk)
+{
+ if (x86_return_thunk != __x86_return_thunk)
+ pr_warn("x86/bugs: return thunk changed\n");
+
+ x86_return_thunk = thunk;
+}
+
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
static void update_spec_ctrl(u64 val)
{
@@ -113,9 +123,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-/* Control MDS CPU buffer clear before idling (halt, mwait) */
-DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
-EXPORT_SYMBOL_GPL(mds_idle_clear);
+/* Control CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_idle_clear);
/*
* Controls whether l1d flush based mitigations are enabled,
@@ -175,6 +185,8 @@ void __init cpu_select_mitigations(void)
*/
srso_select_mitigation();
gds_select_mitigation();
+ its_select_mitigation();
+ tsa_select_mitigation();
}
/*
@@ -438,7 +450,7 @@ static void __init mmio_select_mitigation(void)
* is required irrespective of SMT state.
*/
if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
- static_branch_enable(&mds_idle_clear);
+ static_branch_enable(&cpu_buf_idle_clear);
/*
* Check if the system has the right microcode.
@@ -1104,7 +1116,7 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
- x86_return_thunk = retbleed_return_thunk;
+ set_return_thunk(retbleed_return_thunk);
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -1139,7 +1151,7 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
- x86_return_thunk = call_depth_return_thunk;
+ set_return_thunk(call_depth_return_thunk);
break;
default:
@@ -1174,6 +1186,145 @@ do_cmd_auto:
}
#undef pr_fmt
+#define pr_fmt(fmt) "ITS: " fmt
+
+enum its_mitigation_cmd {
+ ITS_CMD_OFF,
+ ITS_CMD_ON,
+ ITS_CMD_VMEXIT,
+ ITS_CMD_RSB_STUFF,
+};
+
+enum its_mitigation {
+ ITS_MITIGATION_OFF,
+ ITS_MITIGATION_VMEXIT_ONLY,
+ ITS_MITIGATION_ALIGNED_THUNKS,
+ ITS_MITIGATION_RETPOLINE_STUFF,
+};
+
+static const char * const its_strings[] = {
+ [ITS_MITIGATION_OFF] = "Vulnerable",
+ [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected",
+ [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks",
+ [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB",
+};
+
+static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS;
+
+static enum its_mitigation_cmd its_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_CMD_ON : ITS_CMD_OFF;
+
+static int __init its_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+ pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+ return 0;
+ }
+
+ if (!strcmp(str, "off")) {
+ its_cmd = ITS_CMD_OFF;
+ } else if (!strcmp(str, "on")) {
+ its_cmd = ITS_CMD_ON;
+ } else if (!strcmp(str, "force")) {
+ its_cmd = ITS_CMD_ON;
+ setup_force_cpu_bug(X86_BUG_ITS);
+ } else if (!strcmp(str, "vmexit")) {
+ its_cmd = ITS_CMD_VMEXIT;
+ } else if (!strcmp(str, "stuff")) {
+ its_cmd = ITS_CMD_RSB_STUFF;
+ } else {
+ pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+ enum its_mitigation_cmd cmd = its_cmd;
+
+ if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ /* Retpoline+CDT mitigates ITS, bail out */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ boot_cpu_has(X86_FEATURE_CALL_DEPTH)) {
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ goto out;
+ }
+
+ /* Exit early to avoid irrelevant warnings */
+ if (cmd == ITS_CMD_OFF) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (spectre_v2_enabled == SPECTRE_V2_NONE) {
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) {
+ pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+ pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+
+ if (cmd == ITS_CMD_RSB_STUFF &&
+ (!boot_cpu_has(X86_FEATURE_RETPOLINE) || !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))) {
+ pr_err("RSB stuff mitigation not supported, using default\n");
+ cmd = ITS_CMD_ON;
+ }
+
+ switch (cmd) {
+ case ITS_CMD_OFF:
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ case ITS_CMD_VMEXIT:
+ if (boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) {
+ its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+ goto out;
+ }
+ fallthrough;
+ case ITS_CMD_ON:
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ case ITS_CMD_RSB_STUFF:
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ set_return_thunk(call_depth_return_thunk);
+ if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) {
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+ pr_info("Retbleed mitigation updated to stuffing\n");
+ }
+ break;
+ }
+out:
+ pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V2 : " fmt
static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
@@ -1293,9 +1444,13 @@ static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
static enum spectre_v2_user_cmd __init
spectre_v2_parse_user_cmdline(void)
{
+ enum spectre_v2_user_cmd mode;
char arg[20];
int ret, i;
+ mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ?
+ SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return SPECTRE_V2_USER_CMD_NONE;
@@ -1308,7 +1463,7 @@ spectre_v2_parse_user_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_USER_CMD_AUTO;
+ return mode;
for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
if (match_option(arg, ret, v2_user_options[i].option)) {
@@ -1318,8 +1473,8 @@ spectre_v2_parse_user_cmdline(void)
}
}
- pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_USER_CMD_AUTO;
+ pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
+ return mode;
}
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
@@ -1578,7 +1733,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
rrsba_disabled = true;
}
-static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
{
/*
* Similar to context switches, there are two types of RSB attacks
@@ -1602,27 +1757,30 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
*/
switch (mode) {
case SPECTRE_V2_NONE:
- return;
+ break;
- case SPECTRE_V2_EIBRS_LFENCE:
case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
}
- return;
+ break;
- case SPECTRE_V2_EIBRS_RETPOLINE:
case SPECTRE_V2_RETPOLINE:
case SPECTRE_V2_LFENCE:
case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
- return;
- }
+ break;
- pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
- dump_stack();
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
}
/*
@@ -1681,11 +1839,11 @@ static void __init bhi_select_mitigation(void)
return;
}
- /* Mitigate in hardware if supported */
- if (spec_ctrl_bhi_dis())
+ if (!IS_ENABLED(CONFIG_X86_64))
return;
- if (!IS_ENABLED(CONFIG_X86_64))
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
return;
if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
@@ -1854,10 +2012,7 @@ static void __init spectre_v2_select_mitigation(void)
*
* FIXME: Is this pointless for retbleed-affected AMD?
*/
- setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
-
- spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+ spectre_v2_select_rsb_mitigation(mode);
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
@@ -1939,10 +2094,10 @@ static void update_mds_branch_idle(void)
return;
if (sched_smt_active()) {
- static_branch_enable(&mds_idle_clear);
+ static_branch_enable(&cpu_buf_idle_clear);
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
- static_branch_disable(&mds_idle_clear);
+ static_branch_disable(&cpu_buf_idle_clear);
}
}
@@ -1950,6 +2105,94 @@ static void update_mds_branch_idle(void)
#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
+#undef pr_fmt
+#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt
+
+enum tsa_mitigations {
+ TSA_MITIGATION_NONE,
+ TSA_MITIGATION_UCODE_NEEDED,
+ TSA_MITIGATION_USER_KERNEL,
+ TSA_MITIGATION_VM,
+ TSA_MITIGATION_FULL,
+};
+
+static const char * const tsa_strings[] = {
+ [TSA_MITIGATION_NONE] = "Vulnerable",
+ [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary",
+ [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM",
+ [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+};
+
+static enum tsa_mitigations tsa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_FULL : TSA_MITIGATION_NONE;
+
+static int __init tsa_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ else if (!strcmp(str, "on"))
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ else if (!strcmp(str, "user"))
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ else if (!strcmp(str, "vm"))
+ tsa_mitigation = TSA_MITIGATION_VM;
+ else
+ pr_err("Ignoring unknown tsa=%s option.\n", str);
+
+ return 0;
+}
+early_param("tsa", tsa_parse_cmdline);
+
+static void __init tsa_select_mitigation(void)
+{
+ if (tsa_mitigation == TSA_MITIGATION_NONE)
+ return;
+
+ if (cpu_mitigations_off() || !boot_cpu_has_bug(X86_BUG_TSA)) {
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
+ tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
+
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ break;
+
+ case TSA_MITIGATION_VM:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+
+ case TSA_MITIGATION_UCODE_NEEDED:
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ goto out;
+
+ pr_notice("Forcing mitigation on in a VM\n");
+
+ /*
+ * On the off-chance that microcode has been updated
+ * on the host, enable the mitigation in the guest just
+ * in case.
+ */
+ fallthrough;
+ case TSA_MITIGATION_FULL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ default:
+ break;
+ }
+
+out:
+ pr_info("%s\n", tsa_strings[tsa_mitigation]);
+}
+
void cpu_bugs_smt_update(void)
{
mutex_lock(&spec_ctrl_mutex);
@@ -2003,6 +2246,24 @@ void cpu_bugs_smt_update(void)
break;
}
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ case TSA_MITIGATION_VM:
+ case TSA_MITIGATION_FULL:
+ case TSA_MITIGATION_UCODE_NEEDED:
+ /*
+ * TSA-SQ can potentially lead to info leakage between
+ * SMT threads.
+ */
+ if (sched_smt_active())
+ static_branch_enable(&cpu_buf_idle_clear);
+ else
+ static_branch_disable(&cpu_buf_idle_clear);
+ break;
+ case TSA_MITIGATION_NONE:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -2624,10 +2885,10 @@ static void __init srso_select_mitigation(void)
if (boot_cpu_data.x86 == 0x19) {
setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- x86_return_thunk = srso_alias_return_thunk;
+ set_return_thunk(srso_alias_return_thunk);
} else {
setup_force_cpu_cap(X86_FEATURE_SRSO);
- x86_return_thunk = srso_return_thunk;
+ set_return_thunk(srso_return_thunk);
}
if (has_microcode)
srso_mitigation = SRSO_MITIGATION_SAFE_RET;
@@ -2802,6 +3063,11 @@ static ssize_t rfds_show_state(char *buf)
return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
}
+static ssize_t its_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2926,6 +3192,11 @@ static ssize_t gds_show_state(char *buf)
return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
}
+static ssize_t tsa_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -2984,6 +3255,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_RFDS:
return rfds_show_state(buf);
+ case X86_BUG_ITS:
+ return its_show_state(buf);
+
+ case X86_BUG_TSA:
+ return tsa_show_state(buf);
+
default:
break;
}
@@ -3063,6 +3340,16 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib
{
return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
}
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
+
+ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
+}
#endif
void __warn_thunk(void)
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index e6fa03ed9172..a6c6bccfa8b8 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -808,7 +808,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
+ for (j = 0 ; j < 4 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f439763f45ae..976545ec8fdc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1007,17 +1007,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_D_1_EAX] = eax;
}
- /* AMD-defined flags: level 0x80000001 */
+ /*
+ * Check if extended CPUID leaves are implemented: Max extended
+ * CPUID leaf must be in the 0x80000001-0x8000ffff range.
+ */
eax = cpuid_eax(0x80000000);
- c->extended_cpuid_level = eax;
+ c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0;
- if ((eax & 0xffff0000) == 0x80000000) {
- if (eax >= 0x80000001) {
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (c->extended_cpuid_level >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- c->x86_capability[CPUID_8000_0001_ECX] = ecx;
- c->x86_capability[CPUID_8000_0001_EDX] = edx;
- }
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
if (c->extended_cpuid_level >= 0x80000007) {
@@ -1228,6 +1229,12 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define GDS BIT(6)
/* CPU is affected by Register File Data Sampling */
#define RFDS BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY BIT(9)
+/* CPU is affected by Transient Scheduler Attacks */
+#define TSA BIT(10)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
@@ -1239,22 +1246,25 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x0, 0x5), MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPINGS(0x0, 0xb), MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | ITS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPINGS(0x0, 0xc), MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | ITS),
VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED | ITS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
@@ -1272,7 +1282,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
- VULNBL_AMD(0x19, SRSO),
+ VULNBL_AMD(0x19, SRSO | TSA),
{}
};
@@ -1318,6 +1328,32 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
return cpu_matches(cpu_vuln_blacklist, RFDS);
}
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+ return false;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ /* None of the affected CPUs have BHI_CTRL */
+ if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ /*
+ * If a VMM did not expose ITS_NO, assume that a guest could
+ * be running on a vulnerable hardware or may migrate to such
+ * hardware.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ if (cpu_matches(cpu_vuln_blacklist, ITS))
+ return true;
+
+ return false;
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
@@ -1437,9 +1473,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (vulnerable_to_rfds(x86_arch_cap_msr))
setup_force_cpu_bug(X86_BUG_RFDS);
- /* When virtualized, eIBRS could be hidden, assume vulnerable */
- if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) &&
- !cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ /*
+ * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+ * BHI_NO still need to use the BHI mitigation to prevent Intra-mode
+ * attacks. When virtualized, eIBRS could be hidden, assume vulnerable.
+ */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
(boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
boot_cpu_has(X86_FEATURE_HYPERVISOR)))
setup_force_cpu_bug(X86_BUG_BHI);
@@ -1447,6 +1486,22 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+ if (vulnerable_to_its(x86_arch_cap_msr)) {
+ setup_force_cpu_bug(X86_BUG_ITS);
+ if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+ setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+ }
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) ||
+ !cpu_has(c, X86_FEATURE_TSA_L1_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, TSA) ||
+ /* Enable bug on Zen guests to allow for live migration. */
+ (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN)))
+ setup_force_cpu_bug(X86_BUG_TSA);
+ }
+ }
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -2102,20 +2157,16 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif /* CONFIG_X86_64 */
-/*
- * Clear all 6 debug registers:
- */
-static void clear_all_debug_regs(void)
+static void initialize_debug_regs(void)
{
- int i;
-
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
-
- set_debugreg(0, i);
- }
+ /* Control register first -- to make sure everything is disabled. */
+ set_debugreg(DR7_FIXED_1, 7);
+ set_debugreg(DR6_RESERVED, 6);
+ /* dr5 and dr4 don't exist */
+ set_debugreg(0, 3);
+ set_debugreg(0, 2);
+ set_debugreg(0, 1);
+ set_debugreg(0, 0);
}
#ifdef CONFIG_KGDB
@@ -2276,7 +2327,7 @@ void cpu_init(void)
load_mm_ldt(&init_mm);
- clear_all_debug_regs();
+ initialize_debug_regs();
dbg_restore_debug_regs();
doublefault_init_cpu_tss();
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 4b5f3d052151..b93d88ec1417 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -672,26 +672,37 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
}
#endif
-#define TLB_INST_4K 0x01
-#define TLB_INST_4M 0x02
-#define TLB_INST_2M_4M 0x03
+#define TLB_INST_4K 0x01
+#define TLB_INST_4M 0x02
+#define TLB_INST_2M_4M 0x03
-#define TLB_INST_ALL 0x05
-#define TLB_INST_1G 0x06
+#define TLB_INST_ALL 0x05
+#define TLB_INST_1G 0x06
-#define TLB_DATA_4K 0x11
-#define TLB_DATA_4M 0x12
-#define TLB_DATA_2M_4M 0x13
-#define TLB_DATA_4K_4M 0x14
+#define TLB_DATA_4K 0x11
+#define TLB_DATA_4M 0x12
+#define TLB_DATA_2M_4M 0x13
+#define TLB_DATA_4K_4M 0x14
-#define TLB_DATA_1G 0x16
+#define TLB_DATA_1G 0x16
+#define TLB_DATA_1G_2M_4M 0x17
-#define TLB_DATA0_4K 0x21
-#define TLB_DATA0_4M 0x22
-#define TLB_DATA0_2M_4M 0x23
+#define TLB_DATA0_4K 0x21
+#define TLB_DATA0_4M 0x22
+#define TLB_DATA0_2M_4M 0x23
-#define STLB_4K 0x41
-#define STLB_4K_2M 0x42
+#define STLB_4K 0x41
+#define STLB_4K_2M 0x42
+
+/*
+ * All of leaf 0x2's one-byte TLB descriptors implies the same number of
+ * entries for their respective TLB types. The 0x63 descriptor is an
+ * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries
+ * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for
+ * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the
+ * intel_tlb_table[] mapping.
+ */
+#define TLB_0x63_2M_4M_ENTRIES 32
static const struct _tlb_table intel_tlb_table[] = {
{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
@@ -713,7 +724,8 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
- { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+ { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative"
+ " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" },
{ 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
{ 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
{ 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
@@ -813,6 +825,12 @@ static void intel_tlb_lookup(const unsigned char desc)
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
+ case TLB_DATA_1G_2M_4M:
+ if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES)
+ tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES;
+ if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES)
+ tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES;
+ fallthrough;
case TLB_DATA_1G:
if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
@@ -836,7 +854,7 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
+ for (j = 0 ; j < 4 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 14bf8c232e45..dac4564e1d7c 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -327,7 +327,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
struct thresh_restart {
struct threshold_block *b;
- int reset;
int set_lvt_off;
int lvt_off;
u16 old_limit;
@@ -422,13 +421,13 @@ static void threshold_restart_bank(void *_tr)
rdmsr(tr->b->address, lo, hi);
- if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
- tr->reset = 1; /* limit cannot be lower than err count */
-
- if (tr->reset) { /* reset err count and overflow bit */
- hi =
- (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - tr->b->threshold_limit);
+ /*
+ * Reset error count and overflow bit.
+ * This is done during init or after handling an interrupt.
+ */
+ if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) {
+ hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
+ hi |= THRESHOLD_MAX - tr->b->threshold_limit;
} else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
@@ -1099,13 +1098,20 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol
}
bank_type = smca_get_bank_type(cpu, bank);
- if (bank_type >= N_SMCA_BANK_TYPES)
- return NULL;
if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
return smca_umc_block_names[b->block];
- return NULL;
+ }
+
+ if (b && b->block) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block);
+ return buf_mcatype;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank);
+ return buf_mcatype;
}
if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2a938f429c4d..d8f3d9af8acf 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1688,6 +1688,11 @@ static void mc_poll_banks_default(void)
void (*mc_poll_banks)(void) = mc_poll_banks_default;
+static bool should_enable_timer(unsigned long iv)
+{
+ return !mca_cfg.ignore_ce && iv;
+}
+
static void mce_timer_fn(struct timer_list *t)
{
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
@@ -1711,7 +1716,7 @@ static void mce_timer_fn(struct timer_list *t)
if (mce_get_storm_mode()) {
__start_timer(t, HZ);
- } else {
+ } else if (should_enable_timer(iv)) {
__this_cpu_write(mce_next_interval, iv);
__start_timer(t, iv);
}
@@ -2111,11 +2116,10 @@ static void mce_start_timer(struct timer_list *t)
{
unsigned long iv = check_interval * HZ;
- if (mca_cfg.ignore_ce || !iv)
- return;
-
- this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (should_enable_timer(iv)) {
+ this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
static void __mcheck_cpu_setup_timer(void)
@@ -2756,15 +2760,9 @@ static int mce_cpu_dead(unsigned int cpu)
static int mce_cpu_online(unsigned int cpu)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- int ret;
mce_device_create(cpu);
-
- ret = mce_threshold_create_device(cpu);
- if (ret) {
- mce_device_remove(cpu);
- return ret;
- }
+ mce_threshold_create_device(cpu);
mce_reenable_cpu();
mce_start_timer(t);
return 0;
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index f6103e6bf69a..bb0a60b1ed63 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -477,6 +477,7 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c)
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
{
intel_clear_lmce();
+ cmci_clear();
}
bool intel_filter_mce(struct mce *m)
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dac4d64dfb2a..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -300,13 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index f5365b32582a..765b4646648f 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -94,18 +94,6 @@ static struct equiv_cpu_table {
struct equiv_cpu_entry *entry;
} equiv_table;
-union zen_patch_rev {
- struct {
- __u32 rev : 8,
- stepping : 4,
- model : 4,
- __reserved : 4,
- ext_model : 4,
- ext_fam : 8;
- };
- __u32 ucode_rev;
-};
-
union cpuid_1_eax {
struct {
__u32 stepping : 4,
@@ -175,24 +163,36 @@ static bool need_sha_check(u32 cur_rev)
{
switch (cur_rev >> 8) {
case 0x80012: return cur_rev <= 0x800126f; break;
+ case 0x80082: return cur_rev <= 0x800820f; break;
case 0x83010: return cur_rev <= 0x830107c; break;
case 0x86001: return cur_rev <= 0x860010e; break;
case 0x86081: return cur_rev <= 0x8608108; break;
case 0x87010: return cur_rev <= 0x8701034; break;
case 0x8a000: return cur_rev <= 0x8a0000a; break;
+ case 0xa0010: return cur_rev <= 0xa00107a; break;
case 0xa0011: return cur_rev <= 0xa0011da; break;
case 0xa0012: return cur_rev <= 0xa001243; break;
+ case 0xa0082: return cur_rev <= 0xa00820e; break;
case 0xa1011: return cur_rev <= 0xa101153; break;
case 0xa1012: return cur_rev <= 0xa10124e; break;
case 0xa1081: return cur_rev <= 0xa108109; break;
case 0xa2010: return cur_rev <= 0xa20102f; break;
case 0xa2012: return cur_rev <= 0xa201212; break;
+ case 0xa4041: return cur_rev <= 0xa404109; break;
+ case 0xa5000: return cur_rev <= 0xa500013; break;
case 0xa6012: return cur_rev <= 0xa60120a; break;
case 0xa7041: return cur_rev <= 0xa704109; break;
case 0xa7052: return cur_rev <= 0xa705208; break;
case 0xa7080: return cur_rev <= 0xa708009; break;
case 0xa70c0: return cur_rev <= 0xa70C009; break;
+ case 0xaa001: return cur_rev <= 0xaa00116; break;
case 0xaa002: return cur_rev <= 0xaa00218; break;
+ case 0xb0021: return cur_rev <= 0xb002146; break;
+ case 0xb1010: return cur_rev <= 0xb101046; break;
+ case 0xb2040: return cur_rev <= 0xb204031; break;
+ case 0xb4040: return cur_rev <= 0xb404031; break;
+ case 0xb6000: return cur_rev <= 0xb600031; break;
+ case 0xb7000: return cur_rev <= 0xb700031; break;
default: break;
}
@@ -208,8 +208,7 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi
struct sha256_state s;
int i;
- if (x86_family(bsp_cpuid_1_eax) < 0x17 ||
- x86_family(bsp_cpuid_1_eax) > 0x19)
+ if (x86_family(bsp_cpuid_1_eax) < 0x17)
return true;
if (!need_sha_check(cur_rev))
@@ -594,7 +593,7 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
- return -1;
+ return false;
native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr);
@@ -1069,7 +1068,7 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
if (ret != UCODE_OK)
return ret;
- for_each_node(nid) {
+ for_each_node_with_cpus(nid) {
cpu = cpumask_first(cpumask_of_node(nid));
c = &cpu_data(cpu);
@@ -1088,15 +1087,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
static int __init save_microcode_in_initrd(void)
{
- unsigned int cpuid_1_eax = native_cpuid_eax(1);
struct cpuinfo_x86 *c = &boot_cpu_data;
struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
enum ucode_state ret;
struct cpio_data cp;
- if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
return 0;
+ cpuid_1_eax = native_cpuid_eax(1);
+
if (!find_blobs_in_containers(&cp))
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
index 2a1655b1fdd8..1fd349cfc802 100644
--- a/arch/x86/kernel/cpu/microcode/amd_shas.c
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -231,6 +231,13 @@ static const struct patch_digest phashes[] = {
0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
}
},
+ { 0xa0011d7, {
+ 0x35,0x07,0xcd,0x40,0x94,0xbc,0x81,0x6b,
+ 0xfc,0x61,0x56,0x1a,0xe2,0xdb,0x96,0x12,
+ 0x1c,0x1c,0x31,0xb1,0x02,0x6f,0xe5,0xd2,
+ 0xfe,0x1b,0x04,0x03,0x2c,0x8f,0x4c,0x36,
+ }
+ },
{ 0xa001223, {
0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
@@ -294,6 +301,13 @@ static const struct patch_digest phashes[] = {
0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
}
},
+ { 0xa00123b, {
+ 0xef,0xa1,0x1e,0x71,0xf1,0xc3,0x2c,0xe2,
+ 0xc3,0xef,0x69,0x41,0x7a,0x54,0xca,0xc3,
+ 0x8f,0x62,0x84,0xee,0xc2,0x39,0xd9,0x28,
+ 0x95,0xa7,0x12,0x49,0x1e,0x30,0x71,0x72,
+ }
+ },
{ 0xa00820c, {
0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
@@ -301,6 +315,13 @@ static const struct patch_digest phashes[] = {
0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
}
},
+ { 0xa00820d, {
+ 0xf9,0x2a,0xc0,0xf4,0x9e,0xa4,0x87,0xa4,
+ 0x7d,0x87,0x00,0xfd,0xab,0xda,0x19,0xca,
+ 0x26,0x51,0x32,0xc1,0x57,0x91,0xdf,0xc1,
+ 0x05,0xeb,0x01,0x7c,0x5a,0x95,0x21,0xb7,
+ }
+ },
{ 0xa10113e, {
0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
@@ -322,6 +343,13 @@ static const struct patch_digest phashes[] = {
0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
}
},
+ { 0xa10114c, {
+ 0x9e,0xb6,0xa2,0xd9,0x87,0x38,0xc5,0x64,
+ 0xd8,0x88,0xfa,0x78,0x98,0xf9,0x6f,0x74,
+ 0x39,0x90,0x1b,0xa5,0xcf,0x5e,0xb4,0x2a,
+ 0x02,0xff,0xd4,0x8c,0x71,0x8b,0xe2,0xc0,
+ }
+ },
{ 0xa10123e, {
0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
@@ -343,6 +371,13 @@ static const struct patch_digest phashes[] = {
0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
}
},
+ { 0xa10124c, {
+ 0x29,0xea,0xf1,0x2c,0xb2,0xe4,0xef,0x90,
+ 0xa4,0xcd,0x1d,0x86,0x97,0x17,0x61,0x46,
+ 0xfc,0x22,0xcb,0x57,0x75,0x19,0xc8,0xcc,
+ 0x0c,0xf5,0xbc,0xac,0x81,0x9d,0x9a,0xd2,
+ }
+ },
{ 0xa108108, {
0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
@@ -350,6 +385,13 @@ static const struct patch_digest phashes[] = {
0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
}
},
+ { 0xa108109, {
+ 0x85,0xb4,0xbd,0x7c,0x49,0xa7,0xbd,0xfa,
+ 0x49,0x36,0x80,0x81,0xc5,0xb7,0x39,0x1b,
+ 0x9a,0xaa,0x50,0xde,0x9b,0xe9,0x32,0x35,
+ 0x42,0x7e,0x51,0x4f,0x52,0x2c,0x28,0x59,
+ }
+ },
{ 0xa20102d, {
0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
@@ -357,6 +399,13 @@ static const struct patch_digest phashes[] = {
0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
}
},
+ { 0xa20102e, {
+ 0xbe,0x1f,0x32,0x04,0x0d,0x3c,0x9c,0xdd,
+ 0xe1,0xa4,0xbf,0x76,0x3a,0xec,0xc2,0xf6,
+ 0x11,0x00,0xa7,0xaf,0x0f,0xe5,0x02,0xc5,
+ 0x54,0x3a,0x1f,0x8c,0x16,0xb5,0xff,0xbe,
+ }
+ },
{ 0xa201210, {
0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
@@ -364,6 +413,13 @@ static const struct patch_digest phashes[] = {
0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
}
},
+ { 0xa201211, {
+ 0x69,0xa1,0x17,0xec,0xd0,0xf6,0x6c,0x95,
+ 0xe2,0x1e,0xc5,0x59,0x1a,0x52,0x0a,0x27,
+ 0xc4,0xed,0xd5,0x59,0x1f,0xbf,0x00,0xff,
+ 0x08,0x88,0xb5,0xe1,0x12,0xb6,0xcc,0x27,
+ }
+ },
{ 0xa404107, {
0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
@@ -371,6 +427,13 @@ static const struct patch_digest phashes[] = {
0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
}
},
+ { 0xa404108, {
+ 0x69,0x67,0x43,0x06,0xf8,0x0c,0x62,0xdc,
+ 0xa4,0x21,0x30,0x4f,0x0f,0x21,0x2c,0xcb,
+ 0xcc,0x37,0xf1,0x1c,0xc3,0xf8,0x2f,0x19,
+ 0xdf,0x53,0x53,0x46,0xb1,0x15,0xea,0x00,
+ }
+ },
{ 0xa500011, {
0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
@@ -378,6 +441,13 @@ static const struct patch_digest phashes[] = {
0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
}
},
+ { 0xa500012, {
+ 0xeb,0x74,0x0d,0x47,0xa1,0x8e,0x09,0xe4,
+ 0x93,0x4c,0xad,0x03,0x32,0x4c,0x38,0x16,
+ 0x10,0x39,0xdd,0x06,0xaa,0xce,0xd6,0x0f,
+ 0x62,0x83,0x9d,0x8e,0x64,0x55,0xbe,0x63,
+ }
+ },
{ 0xa601209, {
0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
@@ -385,6 +455,13 @@ static const struct patch_digest phashes[] = {
0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
}
},
+ { 0xa60120a, {
+ 0x0c,0x8b,0x3d,0xfd,0x52,0x52,0x85,0x7d,
+ 0x20,0x3a,0xe1,0x7e,0xa4,0x21,0x3b,0x7b,
+ 0x17,0x86,0xae,0xac,0x13,0xb8,0x63,0x9d,
+ 0x06,0x01,0xd0,0xa0,0x51,0x9a,0x91,0x2c,
+ }
+ },
{ 0xa704107, {
0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
@@ -392,6 +469,13 @@ static const struct patch_digest phashes[] = {
0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
}
},
+ { 0xa704108, {
+ 0xd7,0x55,0x15,0x2b,0xfe,0xc4,0xbc,0x93,
+ 0xec,0x91,0xa0,0xae,0x45,0xb7,0xc3,0x98,
+ 0x4e,0xff,0x61,0x77,0x88,0xc2,0x70,0x49,
+ 0xe0,0x3a,0x1d,0x84,0x38,0x52,0xbf,0x5a,
+ }
+ },
{ 0xa705206, {
0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
@@ -399,6 +483,13 @@ static const struct patch_digest phashes[] = {
0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
}
},
+ { 0xa705208, {
+ 0x30,0x1d,0x55,0x24,0xbc,0x6b,0x5a,0x19,
+ 0x0c,0x7d,0x1d,0x74,0xaa,0xd1,0xeb,0xd2,
+ 0x16,0x62,0xf7,0x5b,0xe1,0x1f,0x18,0x11,
+ 0x5c,0xf0,0x94,0x90,0x26,0xec,0x69,0xff,
+ }
+ },
{ 0xa708007, {
0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
@@ -406,6 +497,13 @@ static const struct patch_digest phashes[] = {
0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
}
},
+ { 0xa708008, {
+ 0x08,0x6e,0xf0,0x22,0x4b,0x8e,0xc4,0x46,
+ 0x58,0x34,0xe6,0x47,0xa2,0x28,0xfd,0xab,
+ 0x22,0x3d,0xdd,0xd8,0x52,0x9e,0x1d,0x16,
+ 0xfa,0x01,0x68,0x14,0x79,0x3e,0xe8,0x6b,
+ }
+ },
{ 0xa70c005, {
0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
@@ -413,6 +511,13 @@ static const struct patch_digest phashes[] = {
0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
}
},
+ { 0xa70c008, {
+ 0x0f,0xdb,0x37,0xa1,0x10,0xaf,0xd4,0x21,
+ 0x94,0x0d,0xa4,0xa2,0xe9,0x86,0x6c,0x0e,
+ 0x85,0x7c,0x36,0x30,0xa3,0x3a,0x78,0x66,
+ 0x18,0x10,0x60,0x0d,0x78,0x3d,0x44,0xd0,
+ }
+ },
{ 0xaa00116, {
0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
@@ -441,4 +546,11 @@ static const struct patch_digest phashes[] = {
0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
}
},
+ { 0xaa00216, {
+ 0x79,0xfb,0x5b,0x9f,0xb6,0xe6,0xa8,0xf5,
+ 0x4e,0x7c,0x4f,0x8e,0x1d,0xad,0xd0,0x08,
+ 0xc2,0x43,0x7c,0x8b,0xe6,0xdb,0xd0,0xd2,
+ 0xe8,0x39,0x26,0xc1,0xe5,0x5a,0x48,0xf1,
+ }
+ },
};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b3658d11e7b6..e8021d3e5882 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -41,8 +41,8 @@
#include "internal.h"
-static struct microcode_ops *microcode_ops;
-bool dis_ucode_ldr = true;
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr = false;
bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
@@ -84,6 +84,9 @@ static bool amd_check_current_patch_level(void)
u32 lvl, dummy, i;
u32 *levels;
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
levels = final_levels;
@@ -95,27 +98,29 @@ static bool amd_check_current_patch_level(void)
return false;
}
-static bool __init check_loader_disabled_bsp(void)
+bool __init microcode_loader_disabled(void)
{
- static const char *__dis_opt_str = "dis_ucode_ldr";
- const char *cmdline = boot_command_line;
- const char *option = __dis_opt_str;
+ if (dis_ucode_ldr)
+ return true;
/*
- * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
- * completely accurate as xen pv guests don't see that CPUID bit set but
- * that's good enough as they don't land on the BSP path anyway.
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is clear
+ * The bit is reserved for hypervisor use. This is still not
+ * completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
*/
- if (native_cpuid_ecx(1) & BIT(31))
- return true;
-
- if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
- if (amd_check_current_patch_level())
- return true;
- }
-
- if (cmdline_find_option_bool(cmdline, option) <= 0)
- dis_ucode_ldr = false;
+ if (!have_cpuid_p() ||
+ native_cpuid_ecx(1) & BIT(31) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
return dis_ucode_ldr;
}
@@ -125,7 +130,10 @@ void __init load_ucode_bsp(void)
unsigned int cpuid_1_eax;
bool intel = true;
- if (!have_cpuid_p())
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+
+ if (microcode_loader_disabled())
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -146,9 +154,6 @@ void __init load_ucode_bsp(void)
return;
}
- if (check_loader_disabled_bsp())
- return;
-
if (intel)
load_ucode_intel_bsp(&early_data);
else
@@ -159,6 +164,11 @@ void load_ucode_ap(void)
{
unsigned int cpuid_1_eax;
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't have to either - the BSP variant must've
+ * parsed cmdline already anyway.
+ */
if (dis_ucode_ldr)
return;
@@ -686,6 +696,8 @@ static int load_late_locked(void)
return load_late_stop_cpus(true);
case UCODE_NFOUND:
return -ENOENT;
+ case UCODE_OK:
+ return 0;
default:
return -EBADFD;
}
@@ -810,7 +822,7 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
- if (dis_ucode_ldr)
+ if (microcode_loader_disabled())
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 815fa67356a2..362cc71bbc86 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -74,7 +74,7 @@ void intel_collect_cpu_info(struct cpu_signature *sig)
sig->pf = 0;
sig->rev = intel_get_microcode_revision();
- if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) {
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
unsigned int val[2];
/* get processor flags from MSR 0x17 */
@@ -395,7 +395,7 @@ static int __init save_builtin_microcode(void)
if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
return 0;
- if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
uci.mc = get_microcode_blob(&uci, true);
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 5df621752fef..50a9702ae4e2 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void)
return x86_family(eax);
}
-extern bool dis_ucode_ldr;
extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7b29ebda024f..1ececfce7a46 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -591,7 +591,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (boot_cpu_has(X86_FEATURE_MTRR))
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index d7163b764c62..2d48db66fca8 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -148,7 +148,8 @@ static int closid_alloc(void)
lockdep_assert_held(&rdtgroup_mutex);
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
+ is_llc_occupancy_enabled()) {
cleanest_closid = resctrl_find_cleanest_closid();
if (cleanest_closid < 0)
return cleanest_closid;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index c84c30188fdf..bc4993aa41ed 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -49,6 +49,8 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
+ { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 22b65a5f5ec6..7f8d1e11dbee 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -150,13 +150,15 @@ int __init sgx_drv_init(void)
u64 xfrm_mask;
int ret;
- if (!cpu_feature_enabled(X86_FEATURE_SGX_LC))
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
+ }
cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
if (!(eax & 1)) {
- pr_err("SGX disabled: SGX1 instruction support not available.\n");
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
}
@@ -173,8 +175,10 @@ int __init sgx_drv_init(void)
}
ret = misc_register(&sgx_dev_enclave);
- if (ret)
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
return ret;
+ }
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index b65ab214bdf5..776a20172867 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -64,6 +64,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
struct file *backing;
long ret;
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page))
return PTR_ERR(va_page);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 9ace84486499..147ea26dfdad 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -719,6 +719,8 @@ int arch_memory_failure(unsigned long pfn, int flags)
goto out;
}
+ sgx_unmark_page_reclaimable(page);
+
/*
* TBD: Add additional plumbing to enable pre-emptive
* action for asynchronous poison notification. Until
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 00189cdeb775..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/clocksource.h>
#include <linux/cpu.h>
+#include <linux/efi.h>
#include <linux/reboot.h>
#include <linux/static_call.h>
#include <asm/div64.h>
@@ -429,6 +430,9 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
vmware_paravirt_ops_setup();
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 59d23cdf4ed0..dd8748c45529 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -2,6 +2,7 @@
/*
* Architecture specific OF callbacks.
*/
+#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/interrupt.h>
@@ -313,6 +314,6 @@ void __init x86_flattree_get_config(void)
if (initial_dtb)
early_memunmap(dt, map_len);
#endif
- if (of_have_populated_dt())
+ if (acpi_disabled && of_have_populated_dt())
x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index a7d562697e50..b2b118a8c09b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -195,6 +195,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
+ stack = stack ?: get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
@@ -213,9 +214,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
- for (stack = stack ?: get_stack_pointer(task, regs);
- stack;
- stack = stack_info.next_sp) {
+ for (; stack; stack = stack_info.next_sp) {
const char *stack_name;
stack = PTR_ALIGN(stack, sizeof(long));
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4893d30ce438..b4746eb8b115 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -754,22 +754,21 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
void __init e820__register_nosave_regions(unsigned long limit_pfn)
{
int i;
- unsigned long pfn = 0;
+ u64 last_addr = 0;
for (i = 0; i < e820_table->nr_entries; i++) {
struct e820_entry *entry = &e820_table->entries[i];
- if (pfn < PFN_UP(entry->addr))
- register_nosave_region(pfn, PFN_UP(entry->addr));
-
- pfn = PFN_DOWN(entry->addr + entry->size);
-
if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
- register_nosave_region(PFN_UP(entry->addr), pfn);
+ continue;
- if (pfn >= limit_pfn)
- break;
+ if (last_addr < entry->addr)
+ register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr));
+
+ last_addr = entry->addr + entry->size;
}
+
+ register_nosave_region(PFN_DOWN(last_addr), limit_pfn);
}
#ifdef CONFIG_ACPI
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1209c7aebb21..dcac3c058fb7 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -220,7 +220,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
struct fpstate *fpstate;
unsigned int size;
- size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
+ size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
fpstate = vzalloc(size);
if (!fpstate)
return false;
@@ -232,8 +232,8 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
fpstate->is_guest = true;
gfpu->fpstate = fpstate;
- gfpu->xfeatures = fpu_user_cfg.default_features;
- gfpu->perm = fpu_user_cfg.default_features;
+ gfpu->xfeatures = fpu_kernel_cfg.default_features;
+ gfpu->perm = fpu_kernel_cfg.default_features;
/*
* KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 8f62e0666dea..8abe60919e2f 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -119,7 +119,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
{
struct xregs_state __user *x = buf;
struct _fpx_sw_bytes sw_bytes = {};
- u32 xfeatures;
int err;
/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
@@ -133,12 +132,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
(__u32 __user *)(buf + fpstate->user_size));
/*
- * Read the xfeatures which we copied (directly from the cpu or
- * from the state in task struct) to the user buffers.
- */
- err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
-
- /*
* For legacy compatible, we always set FP/SSE bits in the bit
* vector while saving the state to the user context. This will
* enable us capturing any changes(during sigreturn) to
@@ -149,9 +142,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
* header as well as change any contents in the memory layout.
* xrestore as part of sigreturn will capture all the changes.
*/
- xfeatures |= XFEATURE_MASK_FPSSE;
-
- err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
+ err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE);
return !err;
}
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index aa16f1a1bbcf..f7d8f3d73599 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -69,21 +69,31 @@ static inline u64 xfeatures_mask_independent(void)
return fpu_kernel_cfg.independent_features;
}
+static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask)
+{
+ u64 xfeatures;
+ int err;
+
+ /* Read the xfeatures value already saved in the user buffer */
+ err = __get_user(xfeatures, &xbuf->header.xfeatures);
+ xfeatures |= mask;
+ err |= __put_user(xfeatures, &xbuf->header.xfeatures);
+
+ return err;
+}
+
/*
* Update the value of PKRU register that was already pushed onto the signal frame.
*/
-static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru)
+static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
{
- u64 xstate_bv;
int err;
if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
return 0;
/* Mark PKRU as in-use so that it is restored correctly. */
- xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU;
-
- err = __put_user(xstate_bv, &buf->header.xfeatures);
+ err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU);
if (err)
return err;
@@ -304,7 +314,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr
clac();
if (!err)
- err = update_pkru_in_sigframe(buf, mask, pkru);
+ err = update_pkru_in_sigframe(buf, pkru);
return err;
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8da0e66ca22d..bfab966ea56e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -354,7 +354,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
goto fail;
ip = trampoline + size;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk_at(ip))
__text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
memcpy(ip, retq, sizeof(retq));
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index de001b2146ab..375f2d7f1762 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void)
*ptr = (unsigned long)ptep + PAGE_OFFSET;
#ifdef CONFIG_MICROCODE_INITRD32
- /* Running on a hypervisor? */
- if (native_cpuid_ecx(1) & BIT(31))
- return;
-
params = (struct boot_params *)__pa_nodebug(&boot_params);
if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
return;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 80e262bb627f..cb9852ad6098 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -46,7 +46,8 @@ bool __init pit_timer_init(void)
* VMMs otherwise steal CPU time just to pointlessly waggle
* the (masked) IRQ.
*/
- clockevent_i8253_disable();
+ scoped_guard(irq)
+ clockevent_i8253_disable();
return false;
}
clockevent_i8253_init(true);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index e2fab3ceb09f..9a101150376d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct *tsk)
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
-static void task_update_io_bitmap(struct task_struct *tsk)
+static void task_update_io_bitmap(void)
{
+ struct task_struct *tsk = current;
struct thread_struct *t = &tsk->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
@@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *tsk)
struct io_bitmap *iobm = tsk->thread.io_bitmap;
tsk->thread.io_bitmap = NULL;
- task_update_io_bitmap(tsk);
+ /*
+ * Don't touch the TSS when invoked on a failed fork(). TSS
+ * reflects the state of @current and not the state of @tsk.
+ */
+ if (tsk == current)
+ task_update_io_bitmap();
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
@@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
}
t->iopl_emul = level;
- task_update_io_bitmap(current);
-
+ task_update_io_bitmap();
return 0;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 385e3a5fc304..85fa2db38dc4 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -25,8 +25,10 @@
#include <asm/posted_intr.h>
#include <asm/irq_remapping.h>
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR)
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
+#endif
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -412,7 +414,7 @@ static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
bool handled = false;
for (i = 0; i < 4; i++)
- pir_copy[i] = pir[i];
+ pir_copy[i] = READ_ONCE(pir[i]);
for (i = 0; i < 4; i++) {
if (!pir_copy[i])
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 9c9faa1634fb..e5faeec20b1f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -385,7 +385,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs)
struct perf_event *bp;
/* Disable hardware debugging while we are in kgdb: */
- set_debugreg(0UL, 7);
+ set_debugreg(DR7_FIXED_1, 7);
for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 837450b6e882..1e231dac61e3 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -251,6 +251,8 @@ int module_finalize(const Elf_Ehdr *hdr,
ibt_endbr = s;
}
+ its_init_mod(me);
+
if (retpolines || cfi) {
void *rseg = NULL, *cseg = NULL;
unsigned int rsize = 0, csize = 0;
@@ -271,6 +273,9 @@ int module_finalize(const Elf_Ehdr *hdr,
void *rseg = (void *)retpolines->sh_addr;
apply_retpolines(rseg, rseg + retpolines->sh_size);
}
+
+ its_fini_mod(me);
+
if (returns) {
void *rseg = (void *)returns->sh_addr;
apply_returns(rseg, rseg + returns->sh_size);
@@ -318,4 +323,5 @@ int module_finalize(const Elf_Ehdr *hdr,
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
+ its_free_mod(mod);
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ed163c8c8604..9a95d00f1423 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -40,8 +40,12 @@
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
+/*
+ * An emergency handler can be set in any context including NMI
+ */
struct nmi_desc {
raw_spinlock_t lock;
+ nmi_handler_t emerg_handler;
struct list_head head;
};
@@ -132,9 +136,22 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration)
static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
struct nmi_desc *desc = nmi_to_desc(type);
+ nmi_handler_t ehandler;
struct nmiaction *a;
int handled=0;
+ /*
+ * Call the emergency handler, if set
+ *
+ * In the case of crash_nmi_callback() emergency handler, it will
+ * return in the case of the crashing CPU to enable it to complete
+ * other necessary crashing actions ASAP. Other handlers in the
+ * linked list won't need to be run.
+ */
+ ehandler = desc->emerg_handler;
+ if (ehandler)
+ return ehandler(type, regs);
+
rcu_read_lock();
/*
@@ -224,6 +241,31 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+/**
+ * set_emergency_nmi_handler - Set emergency handler
+ * @type: NMI type
+ * @handler: the emergency handler to be stored
+ *
+ * Set an emergency NMI handler which, if set, will preempt all the other
+ * handlers in the linked list. If a NULL handler is passed in, it will clear
+ * it. It is expected that concurrent calls to this function will not happen
+ * or the system is screwed beyond repair.
+ */
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+
+ if (WARN_ON_ONCE(desc->emerg_handler == handler))
+ return;
+ desc->emerg_handler = handler;
+
+ /*
+ * Ensure the emergency handler is visible to other CPUs before
+ * function return
+ */
+ smp_wmb();
+}
+
static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index fec381533555..0c1b915d7efa 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -100,6 +100,11 @@ int paravirt_disable_iospace(void)
return request_resource(&ioport_resource, &reserve_ioports);
}
+static noinstr void pv_native_safe_halt(void)
+{
+ native_safe_halt();
+}
+
#ifdef CONFIG_PARAVIRT_XXL
static noinstr void pv_native_write_cr2(unsigned long val)
{
@@ -121,10 +126,6 @@ noinstr void pv_native_wbinvd(void)
native_wbinvd();
}
-static noinstr void pv_native_safe_halt(void)
-{
- native_safe_halt();
-}
#endif
struct pv_info pv_info = {
@@ -182,9 +183,11 @@ struct paravirt_patch_template pv_ops = {
.irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
+
+ /* Irq HLT ops. */
.irq.safe_halt = pv_native_safe_halt,
.irq.halt = native_halt,
-#endif /* CONFIG_PARAVIRT_XXL */
/* Mmu ops. */
.mmu.flush_tlb_user = native_flush_tlb_local,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15507e739c25..4c9c98c5deab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -92,7 +92,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- memcpy(dst, src, arch_task_struct_size);
+ /* init_task is not dynamically sized (incomplete FPU state) */
+ if (unlikely(src == &init_task))
+ memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(init_task), 0);
+ else
+ memcpy(dst, src, arch_task_struct_size);
+
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
@@ -175,6 +180,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -463,6 +469,11 @@ void native_tss_update_io_bitmap(void)
} else {
struct io_bitmap *iobm = t->io_bitmap;
+ if (WARN_ON_ONCE(!iobm)) {
+ clear_thread_flag(TIF_IO_BITMAP);
+ native_tss_invalidate_io_bitmap();
+ }
+
/*
* Only copy bitmap data when the sequence number differs. The
* update time is accounted to the incoming task.
@@ -900,19 +911,24 @@ static __init bool prefer_mwait_c1_over_halt(void)
*/
static __cpuidle void mwait_idle(void)
{
+ if (need_resched())
+ return;
+
+ x86_idle_clear_cpu_buffers();
+
if (!current_set_polling_and_test()) {
- if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
- mb(); /* quirk */
- clflush((void *)&current_thread_info()->flags);
- mb(); /* quirk */
- }
+ const void *addr = &current_thread_info()->flags;
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched()) {
- __sti_mwait(0, 0);
- raw_local_irq_disable();
- }
+ alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
+ __monitor(addr, 0, 0);
+ if (need_resched())
+ goto out;
+
+ __sti_mwait(0, 0);
+ raw_local_irq_disable();
}
+
+out:
__current_clr_polling();
}
@@ -933,7 +949,7 @@ void __init select_idle_routine(void)
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- static_call_update(x86_idle, tdx_safe_halt);
+ static_call_update(x86_idle, tdx_halt);
} else {
static_call_update(x86_idle, default_idle);
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0917c7f25720..f10c14cb6ef8 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
/* Only print out debug registers if they are in their non-default state. */
if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
- (d6 == DR6_RESERVED) && (d7 == 0x400))
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))
return;
printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 226472332a70..266366a945ed 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -132,7 +132,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
/* Only print out debug registers if they are in their non-default state. */
if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
- (d6 == DR6_RESERVED) && (d7 == 0x400))) {
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
log_lvl, d0, d1, d2);
printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index dc1dd3f3e67f..9aaac1f9f45b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -926,15 +926,11 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
shootdown_callback = callback;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
- NMI_FLAG_FIRST, "crash"))
- return; /* Return what? */
+
/*
- * Ensure the new callback function is set before sending
- * out the NMI
+ * Set emergency handler to preempt other handlers.
*/
- wmb();
+ set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback);
apic_send_IPI_allbutself(NMI_VECTOR);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index ef654530bf5a..42bbc42bd350 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -33,25 +33,55 @@
#include <asm/smap.h>
#include <asm/gsseg.h>
+/*
+ * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0
+ * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index
+ * GDT, selector values 0~3 all point to the NULL descriptor, thus values
+ * 0, 1, 2 and 3 are all valid NULL selector values.
+ *
+ * However IRET zeros ES, FS, GS, and DS segment registers if any of them
+ * is found to have any nonzero NULL selector value, which can be used by
+ * userspace in pre-FRED systems to spot any interrupt/exception by loading
+ * a nonzero NULL selector and waiting for it to become zero. Before FRED
+ * there was nothing software could do to prevent such an information leak.
+ *
+ * ERETU, the only legit instruction to return to userspace from kernel
+ * under FRED, by design does NOT zero any segment register to avoid this
+ * problem behavior.
+ *
+ * As such, leave NULL selector values 0~3 unchanged.
+ */
+static inline u16 fixup_rpl(u16 sel)
+{
+ return sel <= 3 ? sel : sel | 3;
+}
+
#ifdef CONFIG_IA32_EMULATION
#include <asm/unistd_32_ia32.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
- unsigned int cur;
+ u16 cur;
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
savesegment(gs, cur);
- if ((sc->gs | 0x03) != cur)
- load_gs_index(sc->gs | 0x03);
+ if (fixup_rpl(sc->gs) != cur)
+ load_gs_index(fixup_rpl(sc->gs));
savesegment(fs, cur);
- if ((sc->fs | 0x03) != cur)
- loadsegment(fs, sc->fs | 0x03);
+ if (fixup_rpl(sc->fs) != cur)
+ loadsegment(fs, fixup_rpl(sc->fs));
+
savesegment(ds, cur);
- if ((sc->ds | 0x03) != cur)
- loadsegment(ds, sc->ds | 0x03);
+ if (fixup_rpl(sc->ds) != cur)
+ loadsegment(ds, fixup_rpl(sc->ds));
savesegment(es, cur);
- if ((sc->es | 0x03) != cur)
- loadsegment(es, sc->es | 0x03);
+ if (fixup_rpl(sc->es) != cur)
+ loadsegment(es, fixup_rpl(sc->es));
}
#define sigset32_t compat_sigset_t
@@ -105,18 +135,12 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs,
regs->orig_ax = -1;
#ifdef CONFIG_IA32_EMULATION
- /*
- * Reload fs and gs if they have changed in the signal
- * handler. This does not handle long fs/gs base changes in
- * the handler, but does not clobber them at least in the
- * normal case.
- */
reload_segments(&sc);
#else
- loadsegment(gs, sc.gs);
- regs->fs = sc.fs;
- regs->es = sc.es;
- regs->ds = sc.ds;
+ loadsegment(gs, fixup_rpl(sc.gs));
+ regs->fs = fixup_rpl(sc.fs);
+ regs->es = fixup_rpl(sc.es);
+ regs->ds = fixup_rpl(sc.ds);
#endif
return fpu__restore_sig(compat_ptr(sc.fpstate), 1);
@@ -128,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn)
struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
sigset_t set;
+ prevent_single_step_upon_eretu(regs);
+
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
if (__get_user(set.sig[0], &frame->sc.oldmask)
@@ -151,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn)
struct rt_sigframe_ia32 __user *frame;
sigset_t set;
+ prevent_single_step_upon_eretu(regs);
+
frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
if (!access_ok(frame, sizeof(*frame)))
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ee9453891901..d483b585c6c6 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn)
sigset_t set;
unsigned long uc_flags;
+ prevent_single_step_upon_eretu(regs);
+
frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
@@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
sigset_t set;
unsigned long uc_flags;
+ prevent_single_step_upon_eretu(regs);
+
frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
if (!access_ok(frame, sizeof(*frame)))
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f1fac08fdef2..2c451de702c8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -681,9 +681,9 @@ static void __init smp_quirk_init_udelay(void)
return;
/* if modern processor, use no delay */
- if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) {
init_udelay = 0;
return;
}
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 9e51242ed125..aae909d4ed78 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -81,7 +81,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
break;
case RET:
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk_at(insn))
code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
else
code = &retinsn;
@@ -90,7 +90,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
case JCC:
if (!func) {
func = __static_call_return;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk())
func = x86_return_thunk;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2dbadf347b5f..243f3bc9b4dc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -94,10 +94,17 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
/*
* Check for UD1 or UD2, accounting for Address Size Override Prefixes.
- * If it's a UD1, get the ModRM byte to pass along to UBSan.
+ * If it's a UD1, further decode to determine its use:
+ *
+ * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
+ * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
+ * static_call: 0f b9 cc ud1 %esp,%ecx
+ *
+ * Notably UBSAN uses EAX, static_call uses ECX.
*/
-__always_inline int decode_bug(unsigned long addr, u32 *imm)
+__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
{
+ unsigned long start = addr;
u8 v;
if (addr < TASK_SIZE_MAX)
@@ -110,24 +117,42 @@ __always_inline int decode_bug(unsigned long addr, u32 *imm)
return BUG_NONE;
v = *(u8 *)(addr++);
- if (v == SECOND_BYTE_OPCODE_UD2)
+ if (v == SECOND_BYTE_OPCODE_UD2) {
+ *len = addr - start;
return BUG_UD2;
+ }
- if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1)
+ if (v != SECOND_BYTE_OPCODE_UD1)
return BUG_NONE;
- /* Retrieve the immediate (type value) for the UBSAN UD1 */
- v = *(u8 *)(addr++);
- if (X86_MODRM_RM(v) == 4)
- addr++;
-
*imm = 0;
- if (X86_MODRM_MOD(v) == 1)
- *imm = *(u8 *)addr;
- else if (X86_MODRM_MOD(v) == 2)
- *imm = *(u32 *)addr;
- else
- WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v));
+ v = *(u8 *)(addr++); /* ModRM */
+
+ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
+ addr++; /* SIB */
+
+ /* Decode immediate, if present */
+ switch (X86_MODRM_MOD(v)) {
+ case 0: if (X86_MODRM_RM(v) == 5)
+ addr += 4; /* RIP + disp32 */
+ break;
+
+ case 1: *imm = *(s8 *)addr;
+ addr += 1;
+ break;
+
+ case 2: *imm = *(s32 *)addr;
+ addr += 4;
+ break;
+
+ case 3: break;
+ }
+
+ /* record instruction length */
+ *len = addr - start;
+
+ if (X86_MODRM_REG(v) == 0) /* EAX */
+ return BUG_UD1_UBSAN;
return BUG_UD1;
}
@@ -258,10 +283,10 @@ static inline void handle_invalid_op(struct pt_regs *regs)
static noinstr bool handle_bug(struct pt_regs *regs)
{
bool handled = false;
- int ud_type;
- u32 imm;
+ int ud_type, ud_len;
+ s32 ud_imm;
- ud_type = decode_bug(regs->ip, &imm);
+ ud_type = decode_bug(regs->ip, &ud_imm, &ud_len);
if (ud_type == BUG_NONE)
return handled;
@@ -281,15 +306,28 @@ static noinstr bool handle_bug(struct pt_regs *regs)
*/
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_enable();
- if (ud_type == BUG_UD2) {
+
+ switch (ud_type) {
+ case BUG_UD2:
if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
- regs->ip += LEN_UD2;
+ regs->ip += ud_len;
handled = true;
}
- } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
- pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip);
+ break;
+
+ case BUG_UD1_UBSAN:
+ if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
+ pr_crit("%s at %pS\n",
+ report_ubsan_failure(regs, ud_imm),
+ (void *)regs->ip);
+ }
+ break;
+
+ default:
+ break;
}
+
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_disable();
instrumentation_end();
@@ -380,6 +418,21 @@ __visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
#endif
/*
+ * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64
+ * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch
+ * between configs triggers objtool warnings.
+ *
+ * This is a temporary hack until we have compiler or plugin support for
+ * annotating noreturns.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+#define always_true() true
+#else
+bool always_true(void);
+bool __weak always_true(void) { return true; }
+#endif
+
+/*
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
*
* On x86_64, this is more or less a normal kernel entry. Notwithstanding the
@@ -514,7 +567,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
die("double fault", regs, error_code);
- panic("Machine halted.");
+ if (always_true())
+ panic("Machine halted.");
instrumentation_end();
}
@@ -923,24 +977,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
#endif
}
-static __always_inline unsigned long debug_read_clear_dr6(void)
+static __always_inline unsigned long debug_read_reset_dr6(void)
{
unsigned long dr6;
+ get_debugreg(dr6, 6);
+ dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
+
/*
* The Intel SDM says:
*
- * Certain debug exceptions may clear bits 0-3. The remaining
- * contents of the DR6 register are never cleared by the
- * processor. To avoid confusion in identifying debug
- * exceptions, debug handlers should clear the register before
- * returning to the interrupted task.
+ * Certain debug exceptions may clear bits 0-3 of DR6.
*
- * Keep it simple: clear DR6 immediately.
+ * BLD induced #DB clears DR6.BLD and any other debug
+ * exception doesn't modify DR6.BLD.
+ *
+ * RTM induced #DB clears DR6.RTM and any other debug
+ * exception sets DR6.RTM.
+ *
+ * To avoid confusion in identifying debug exceptions,
+ * debug handlers should set DR6.BLD and DR6.RTM, and
+ * clear other DR6 bits before returning.
+ *
+ * Keep it simple: write DR6 with its architectural reset
+ * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately.
*/
- get_debugreg(dr6, 6);
set_debugreg(DR6_RESERVED, 6);
- dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
return dr6;
}
@@ -1140,13 +1202,13 @@ out:
/* IST stack entry */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
- exc_debug_kernel(regs, debug_read_clear_dr6());
+ exc_debug_kernel(regs, debug_read_reset_dr6());
}
/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
- exc_debug_user(regs, debug_read_clear_dr6());
+ exc_debug_user(regs, debug_read_reset_dr6());
}
#ifdef CONFIG_X86_FRED
@@ -1165,7 +1227,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug)
{
/*
* FRED #DB stores DR6 on the stack in the format which
- * debug_read_clear_dr6() returns for the IDT entry points.
+ * debug_read_reset_dr6() returns for the IDT entry points.
*/
unsigned long dr6 = fred_event_data(regs);
@@ -1180,7 +1242,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug)
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_RAW(exc_debug)
{
- unsigned long dr6 = debug_read_clear_dr6();
+ unsigned long dr6 = debug_read_reset_dr6();
if (user_mode(regs))
exc_debug_user(regs, dr6);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index dfe6847fd99e..310d8cdf7ca3 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -956,7 +956,7 @@ static unsigned long long cyc2ns_suspend;
void tsc_save_sched_clock_state(void)
{
- if (!sched_clock_stable())
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
return;
cyc2ns_suspend = sched_clock();
@@ -976,7 +976,7 @@ void tsc_restore_sched_clock_state(void)
unsigned long flags;
int cpu;
- if (!sched_clock_stable())
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
return;
local_irq_save(flags);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 5a952c5ea66b..9194695662b2 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,19 +357,23 @@ void *arch_uprobe_trampoline(unsigned long *psize)
return &insn;
}
-static unsigned long trampoline_check_ip(void)
+static unsigned long trampoline_check_ip(unsigned long tramp)
{
- unsigned long tramp = uprobe_get_trampoline_vaddr();
-
return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
}
SYSCALL_DEFINE0(uretprobe)
{
struct pt_regs *regs = task_pt_regs(current);
- unsigned long err, ip, sp, r11_cx_ax[3];
+ unsigned long err, ip, sp, r11_cx_ax[3], tramp;
+
+ /* If there's no trampoline, we are called from wrong place. */
+ tramp = uprobe_get_trampoline_vaddr();
+ if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR))
+ goto sigill;
- if (regs->ip != trampoline_check_ip())
+ /* Make sure the ip matches the only allowed sys_uretprobe caller. */
+ if (unlikely(regs->ip != trampoline_check_ip(tramp)))
goto sigill;
err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index feb8102a9ca7..e2567c8f6bac 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -530,4 +530,14 @@ INIT_PER_CPU(irq_stack_backing_store);
"SRSO function pair won't alias");
#endif
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline");
+. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart");
+. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array");
+#endif
+
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline");
+#endif
+
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 83bfecd1a6e4..8f587c5bb6bc 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -814,6 +814,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
+ F(VERW_CLEAR) |
F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
F(WRMSR_XX_BASE_NS)
);
@@ -821,11 +822,19 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_VERW_CLEAR);
kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
F(PERFMON_V2)
);
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0021_ECX,
+ F(TSA_SQ_NO) | F(TSA_L1_NO)
+ );
+
+ kvm_cpu_cap_check_and_set(X86_FEATURE_TSA_SQ_NO);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_TSA_L1_NO);
+
/*
* Synthesize "LFENCE is serializing" into the AMD-defined entry in
* KVM's supported CPUID if the feature is reported as supported by the
@@ -1047,8 +1056,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
}
break;
case 0xa: { /* Architectural Performance Monitoring */
- union cpuid10_eax eax;
- union cpuid10_edx edx;
+ union cpuid10_eax eax = { };
+ union cpuid10_edx edx = { };
if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
@@ -1064,8 +1073,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
if (kvm_pmu_cap.version)
edx.split.anythread_deprecated = 1;
- edx.split.reserved1 = 0;
- edx.split.reserved2 = 0;
entry->eax = eax.full;
entry->ebx = kvm_pmu_cap.events_mask;
@@ -1378,16 +1385,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x80000021:
- entry->ebx = entry->ecx = entry->edx = 0;
+ entry->ebx = entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+ cpuid_entry_override(entry, CPUID_8000_0021_ECX);
break;
/* AMD Extended Performance Monitoring and Debug */
case 0x80000022: {
- union cpuid_0x80000022_ebx ebx;
+ union cpuid_0x80000022_ebx ebx = { };
entry->ecx = entry->edx = 0;
if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
- entry->eax = entry->ebx;
+ entry->eax = entry->ebx = 0;
break;
}
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ad479cfb91bc..f16a7b2c2adc 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -2,7 +2,6 @@
#ifndef ARCH_X86_KVM_CPUID_H
#define ARCH_X86_KVM_CPUID_H
-#include "x86.h"
#include "reverse_cpuid.h"
#include <asm/cpu.h>
#include <asm/processor.h>
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e72aed25d721..60986f67c35a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -651,9 +651,10 @@ static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
}
static inline bool emul_is_noncanonical_address(u64 la,
- struct x86_emulate_ctxt *ctxt)
+ struct x86_emulate_ctxt *ctxt,
+ unsigned int flags)
{
- return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt));
+ return !ctxt->ops->is_canonical_addr(ctxt, la, flags);
}
/*
@@ -1733,7 +1734,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (ret != X86EMUL_CONTINUE)
return ret;
if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
- ((u64)base3 << 32), ctxt))
+ ((u64)base3 << 32), ctxt,
+ X86EMUL_F_DT_LOAD))
return emulate_gp(ctxt, err_code);
}
@@ -2516,8 +2518,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
- if (emul_is_noncanonical_address(rcx, ctxt) ||
- emul_is_noncanonical_address(rdx, ctxt))
+ if (emul_is_noncanonical_address(rcx, ctxt, 0) ||
+ emul_is_noncanonical_address(rdx, ctxt, 0))
return emulate_gp(ctxt, 0);
break;
}
@@ -3494,7 +3496,8 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
if (rc != X86EMUL_CONTINUE)
return rc;
if (ctxt->mode == X86EMUL_MODE_PROT64 &&
- emul_is_noncanonical_address(desc_ptr.address, ctxt))
+ emul_is_noncanonical_address(desc_ptr.address, ctxt,
+ X86EMUL_F_DT_LOAD))
return emulate_gp(ctxt, 0);
if (lgdt)
ctxt->ops->set_gdt(ctxt, &desc_ptr);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 44c88537448c..79d06a8a5b7d 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1980,6 +1980,9 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
goto out_flush_all;
+ if (is_noncanonical_invlpg_address(entries[i], vcpu))
+ continue;
+
/*
* Lower 12 bits of 'address' encode the number of additional
* pages to flush.
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 55a18e2f2dcd..10495fffb890 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -94,6 +94,8 @@ struct x86_instruction_info {
#define X86EMUL_F_FETCH BIT(1)
#define X86EMUL_F_IMPLICIT BIT(2)
#define X86EMUL_F_INVLPG BIT(3)
+#define X86EMUL_F_MSR BIT(4)
+#define X86EMUL_F_DT_LOAD BIT(5)
struct x86_emulate_ops {
void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
@@ -235,6 +237,9 @@ struct x86_emulate_ops {
gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
unsigned int flags);
+
+ bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
+ unsigned int flags);
};
/* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9dc5dd43ae7f..e9322358678b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
+#include "x86.h"
#include "cpuid.h"
extern bool __read_mostly enable_mmio_caching;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 19c96278ba75..8edfb4e4a73d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6234,7 +6234,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
/* It's actually a GPA for vcpu->arch.guest_mmu. */
if (mmu != &vcpu->arch.guest_mmu) {
/* INVLPG on a non-canonical address is a NOP according to the SDM. */
- if (is_noncanonical_address(addr, vcpu))
+ if (is_noncanonical_invlpg_address(addr, vcpu))
return;
kvm_x86_call(flush_tlb_gva)(vcpu, addr);
@@ -7589,7 +7589,7 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
kvm_nx_huge_page_recovery_worker_kill,
kvm, "kvm-nx-lpage-recovery");
- if (!nx_thread)
+ if (IS_ERR(nx_thread))
return;
vhost_task_start(nx_thread);
@@ -7616,9 +7616,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range)
{
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
/*
* Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
* supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
@@ -7633,27 +7654,49 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;
- return kvm_unmap_gfn_range(kvm, range);
-}
+ if (WARN_ON_ONCE(range->end <= range->start))
+ return false;
-static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
-}
+ /*
+ * If the head and tail pages of the range currently allow a hugepage,
+ * i.e. reside fully in the slot and don't have mixed attributes, then
+ * add each corresponding hugepage range to the ongoing invalidation,
+ * e.g. to prevent KVM from creating a hugepage in response to a fault
+ * for a gfn whose attributes aren't changing. Note, only the range
+ * of gfns whose attributes are being modified needs to be explicitly
+ * unmapped, as that will unmap any existing hugepages.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t start = gfn_round_for_level(range->start, level);
+ gfn_t end = gfn_round_for_level(range->end - 1, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
-static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
-}
+ if ((start != range->start || start + nr_pages > range->end) &&
+ start >= slot->base_gfn &&
+ start + nr_pages <= slot->base_gfn + slot->npages &&
+ !hugepage_test_mixed(slot, start, level))
+ kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);
-static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
- int level)
-{
- lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+ if (end == start)
+ continue;
+
+ if ((end + nr_pages) > range->end &&
+ (end + nr_pages) <= (slot->base_gfn + slot->npages) &&
+ !hugepage_test_mixed(slot, end, level))
+ kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
+ }
+
+ /* Unmap the old attribute page. */
+ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ range->attr_filter = KVM_FILTER_SHARED;
+ else
+ range->attr_filter = KVM_FILTER_PRIVATE;
+
+ return kvm_unmap_gfn_range(kvm, range);
}
+
+
static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, int level, unsigned long attrs)
{
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 05490b9d8a43..6f74e2b27c1e 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -19,6 +19,7 @@
#include <asm/mtrr.h>
#include "cpuid.h"
+#include "x86.h"
static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
{
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 0d17d6b70639..0ea847b82354 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -18,6 +18,7 @@ enum kvm_only_cpuid_leafs {
CPUID_8000_0022_EAX,
CPUID_7_2_EDX,
CPUID_24_0_EBX,
+ CPUID_8000_0021_ECX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -68,6 +69,10 @@ enum kvm_only_cpuid_leafs {
/* CPUID level 0x80000022 (EAX) */
#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0)
+/* CPUID level 0x80000021 (ECX) */
+#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1)
+#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -98,6 +103,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
[CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
[CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
+ [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX},
};
/*
@@ -137,6 +143,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO);
+ KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO);
default:
return x86_feature;
}
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index 85241c0c7f56..b0cab215b08f 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
kvm_mmu_reset_context(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_smm_changed);
void process_smi(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 4b74ea91f4e6..63dea8ecd7ef 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -820,7 +820,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
*/
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
@@ -896,6 +896,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
+ bool enable_remapped_mode = true;
int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
@@ -933,6 +934,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
kvm_vcpu_apicv_active(&svm->vcpu)) {
struct amd_iommu_pi_data pi;
+ enable_remapped_mode = false;
+
/* Try to enable guest_mode in IRTE */
pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
AVIC_HPA_MASK);
@@ -951,33 +954,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
*/
if (!ret && pi.is_guest_mode)
svm_ir_list_add(svm, &pi);
- } else {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
-
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.prev_ga_tag = 0;
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
-
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
}
if (!ret && svm) {
@@ -993,6 +969,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
}
ret = 0;
+ if (enable_remapped_mode) {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.prev_ga_tag = 0;
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;
@@ -1199,6 +1203,12 @@ bool avic_hardware_setup(void)
return false;
}
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
+ !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
+ pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
+ return false;
+ }
+
if (boot_cpu_has(X86_FEATURE_AVIC)) {
pr_info("AVIC enabled\n");
} else if (force_avic) {
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index e9af87b12814..c4ae73541fc5 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2058,6 +2058,10 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
struct kvm_vcpu *src_vcpu;
unsigned long i;
+ if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
+ dst->created_vcpus != atomic_read(&dst->online_vcpus))
+ return -EBUSY;
+
if (!sev_es_guest(src))
return 0;
@@ -3957,16 +3961,12 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
/*
* The target vCPU is valid, so the vCPU will be kicked unless the
- * request is for CREATE_ON_INIT. For any errors at this stage, the
- * kick will place the vCPU in an non-runnable state.
+ * request is for CREATE_ON_INIT.
*/
kick = true;
mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
- target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
- target_svm->sev_es.snp_ap_waiting_for_reset = true;
-
/* Interrupt injection mode shouldn't change for AP creation */
if (request < SVM_VMGEXIT_AP_DESTROY) {
u64 sev_features;
@@ -4012,20 +4012,23 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
break;
case SVM_VMGEXIT_AP_DESTROY:
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
break;
default:
vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
request);
ret = -EINVAL;
- break;
+ goto out;
}
-out:
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
if (kick) {
kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
kvm_vcpu_kick(target_vcpu);
}
+out:
mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
return ret;
@@ -4579,6 +4582,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
+ struct kvm *kvm = svm->vcpu.kvm;
+
/*
* All host state for SEV-ES guests is categorized into three swap types
* based on how it is handled by hardware during a world switch:
@@ -4602,10 +4607,15 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
- * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
- * saves and loads debug registers (Type-A).
+ * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+ * not save or load debug registers. Sadly, on CPUs without
+ * ALLOWED_SEV_FEATURES, KVM can't prevent SNP guests from enabling
+ * DebugSwap on secondary vCPUs without KVM's knowledge via "AP Create".
+ * Save all registers if DebugSwap is supported to prevent host state
+ * from being clobbered by a misbehaving guest.
*/
- if (sev_vcpu_has_debug_swap(svm)) {
+ if (sev_vcpu_has_debug_swap(svm) ||
+ (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
hostsa->dr0 = native_get_debugreg(0);
hostsa->dr1 = native_get_debugreg(1);
hostsa->dr2 = native_get_debugreg(2);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a7cb7c82b38e..1f42a71b15c0 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1483,7 +1483,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
int i;
- for_each_online_cpu(i)
+ for_each_possible_cpu(i)
cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
}
@@ -2222,6 +2222,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
*/
if (!sev_es_guest(vcpu->kvm)) {
clear_page(svm->vmcb);
+#ifdef CONFIG_KVM_SMM
+ if (is_smm(vcpu))
+ kvm_smm_changed(vcpu, false);
+#endif
kvm_vcpu_reset(vcpu, true);
}
@@ -3167,6 +3171,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}
+
+ /*
+ * AMD changed the architectural behavior of bits 5:2. On CPUs
+ * without BusLockTrap, bits 5:2 control "external pins", but
+ * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
+ * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
+ * the guest to set bits 5:2 despite not actually virtualizing
+ * Performance-Monitoring/Breakpoint external pins. Drop bits
+ * 5:2 for backwards compatibility.
+ */
+ data &= ~GENMASK(5, 2);
+
+ /*
+ * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+ * way to communicate lack of support to the guest.
+ */
+ if (data & DEBUGCTLMSR_BTF) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ data &= ~DEBUGCTLMSR_BTF;
+ }
+
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
@@ -4176,6 +4201,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_enter_irqoff();
+ /*
+ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+ * VMRUN controls whether or not physical IRQs are masked (KVM always
+ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+ * into guest state if delivery of an event during VMRUN triggers a
+ * #VMEXIT, and the guest_state transitions already tell lockdep that
+ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+ * this path, so IRQs aren't actually unmasked while running host code.
+ */
+ raw_local_irq_enable();
+
amd_clear_divider();
if (sev_es_guest(vcpu->kvm))
@@ -4184,6 +4221,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
else
__svm_vcpu_run(svm, spec_ctrl_intercepted);
+ raw_local_irq_disable();
+
guest_state_exit_irqoff();
}
@@ -4240,6 +4279,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
clgi();
kvm_load_guest_xsave_state(vcpu);
+ /*
+ * Hardware only context switches DEBUGCTL if LBR virtualization is
+ * enabled. Manually load DEBUGCTL if necessary (and restore it after
+ * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+ * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+ */
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(svm->vmcb->save.dbgctl);
+
kvm_wait_lapic_expire(vcpu);
/*
@@ -4267,6 +4316,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
+
kvm_load_host_xsave_state(vcpu);
stgi();
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 43fa6a16eb19..d114efac7af7 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -591,7 +591,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
/* svm.c */
#define MSR_INVALID 0xffffffffU
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
extern bool dump_invalid_vmcb;
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 2ed80aea3bb1..235c4af6b692 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -169,13 +169,12 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
- /* Enter guest mode */
- sti
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+ /* Enter guest mode */
3: vmrun %_ASM_AX
4:
- cli
-
/* Pop @svm to RAX while it's the only available register. */
pop %_ASM_AX
@@ -339,13 +338,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov SVM_current_vmcb(%rdi), %rax
mov KVM_VMCB_pa(%rax), %rax
- /* Enter guest mode */
- sti
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+ /* Enter guest mode */
1: vmrun %rax
-
-2: cli
-
+2:
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index fab6a1ad98dc..fa41d036acd4 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/smp.h>
+#include "x86.h"
#include "../cpuid.h"
#include "hyperv.h"
#include "nested.h"
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 22bee8a71144..903e874041ac 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,6 +7,7 @@
#include <asm/debugreg.h>
#include <asm/mmu_context.h>
+#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
@@ -16,7 +17,6 @@
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
-#include "x86.h"
#include "smm.h"
static bool __read_mostly enable_shadow_vmcs = 1;
@@ -3020,8 +3020,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
return -EINVAL;
- if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
+ if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
+ CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
return -EINVAL;
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
@@ -3055,12 +3055,12 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(vmcs12->host_ss_selector == 0 && !ia32e))
return -EINVAL;
- if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
+ if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
+ CC(is_noncanonical_address(vmcs12->host_rip, vcpu, 0)))
return -EINVAL;
/*
@@ -3178,7 +3178,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
- (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
+ (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
return -EINVAL;
@@ -5172,7 +5172,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
- exn = is_noncanonical_address(*ret, vcpu);
+ exn = is_noncanonical_address(*ret, vcpu, 0);
} else {
/*
* When not in long mode, the virtual/linear address is
@@ -5983,7 +5983,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* invalidation.
*/
if (!operand.vpid ||
- is_noncanonical_address(operand.gla, vcpu))
+ is_noncanonical_invlpg_address(operand.gla, vcpu))
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_vcpu_addr(vpid02, operand.gla);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 83382a4d1d66..9c9d4a336166 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -365,7 +365,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
case MSR_IA32_DS_AREA:
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
pmu->ds_area = data;
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index ec08fa3caf43..6b803324a981 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -274,6 +274,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
+ bool enable_remapped_mode = true;
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
@@ -312,21 +313,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- /*
- * Make sure the IRTE is in remapped mode if
- * we don't handle it in posted mode.
- */
- ret = irq_set_vcpu_affinity(host_irq, NULL);
- if (ret < 0) {
- printk(KERN_INFO
- "failed to back to remapped mode, irq: %u\n",
- host_irq);
- goto out;
- }
-
+ !kvm_irq_is_postable(&irq))
continue;
- }
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
@@ -334,11 +322,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
- if (set)
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else
- ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (!set)
+ continue;
+ enable_remapped_mode = false;
+
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
if (ret < 0) {
printk(KERN_INFO "%s: failed to update PI IRTE\n",
__func__);
@@ -346,6 +335,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
}
}
+ if (enable_remapped_mode)
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
ret = 0;
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index a3c3d2a51f47..b352a3ba7354 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -4,12 +4,11 @@
#include <asm/sgx.h>
-#include "cpuid.h"
+#include "x86.h"
#include "kvm_cache_regs.h"
#include "nested.h"
#include "sgx.h"
#include "vmx.h"
-#include "x86.h"
bool __read_mostly enable_sgx = 1;
module_param_named(sgx, enable_sgx, bool, 0444);
@@ -38,7 +37,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
fault = true;
} else if (likely(is_64_bit_mode(vcpu))) {
*gva = vmx_get_untagged_addr(vcpu, *gva, 0);
- fault = is_noncanonical_address(*gva, vcpu);
+ fault = is_noncanonical_address(*gva, vcpu, 0);
} else {
*gva &= 0xffffffff;
fault = (s.unusable) ||
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1af30e3472cd..9a4ebf3dfbfc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -770,8 +770,11 @@ void vmx_emergency_disable_virtualization_cpu(void)
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
kvm_cpu_vmxoff();
}
@@ -1515,16 +1518,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
-
- vmx->host_debugctlmsr = get_debugctlmsr();
}
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2285,7 +2284,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
(!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
- if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
@@ -2450,7 +2449,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if (index >= 2 * vmx->pt_desc.num_address_ranges)
return 1;
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
@@ -7314,7 +7313,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mmio_stale_data_clear) &&
kvm_arch_has_assigned_device(vcpu->kvm))
- mds_clear_cpu_buffers();
+ x86_clear_cpu_buffers();
vmx_disable_fb_clear(vmx);
@@ -7454,8 +7453,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
}
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 41bf59bbc642..cf57fbf12104 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -339,8 +339,6 @@ struct vcpu_vmx {
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
- unsigned long host_debugctlmsr;
-
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b67a2f46e40b..213af0fda768 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1623,7 +1623,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
- ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1657,6 +1657,8 @@ static u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_MDS_NO;
if (!boot_cpu_has_bug(X86_BUG_RFDS))
data |= ARCH_CAP_RFDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ data |= ARCH_CAP_ITS_NO;
if (!boot_cpu_has(X86_FEATURE_RTM)) {
/*
@@ -1843,7 +1845,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
case MSR_KERNEL_GS_BASE:
case MSR_CSTAR:
case MSR_LSTAR:
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
break;
case MSR_IA32_SYSENTER_EIP:
@@ -1860,7 +1862,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
+ data = __canonical_address(data, max_host_virt_addr_bits());
break;
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
@@ -4590,6 +4592,11 @@ static bool kvm_is_vm_type_supported(unsigned long type)
return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
+static inline u32 kvm_sync_valid_fields(struct kvm *kvm)
+{
+ return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = 0;
@@ -4698,7 +4705,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
case KVM_CAP_SYNC_REGS:
- r = KVM_SYNC_X86_VALID_FIELDS;
+ r = kvm_sync_valid_fields(kvm);
break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_VALID_FLAGS;
@@ -8601,6 +8608,12 @@ static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt,
addr, flags);
}
+static bool emulator_is_canonical_addr(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, unsigned int flags)
+{
+ return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.vm_bugged = emulator_vm_bugged,
.read_gpr = emulator_read_gpr,
@@ -8647,6 +8660,7 @@ static const struct x86_emulate_ops emulate_ops = {
.triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
.get_untagged_addr = emulator_get_untagged_addr,
+ .is_canonical_addr = emulator_is_canonical_addr,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -10952,7 +10966,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
if (unlikely(vcpu->arch.switch_db_regs)) {
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
@@ -10961,9 +10975,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
}
+ vcpu->arch.host_debugctl = get_debugctlmsr();
+
guest_timing_enter_irqoff();
for (;;) {
@@ -11468,6 +11484,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
+ u32 sync_valid_fields;
int r;
r = kvm_mmu_post_init_vm(vcpu->kvm);
@@ -11513,8 +11530,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
- (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
+ sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
+ if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
+ (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
r = -EINVAL;
goto out;
}
@@ -11572,7 +11590,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
out:
kvm_put_guest_fpu(vcpu);
- if (kvm_run->kvm_valid_regs)
+ if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_vcpu_srcu_read_unlock(vcpu);
@@ -11760,6 +11778,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
if (kvm_mpx_supported())
kvm_load_guest_fpu(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+
r = kvm_apic_accept_events(vcpu);
if (r < 0)
goto out;
@@ -11773,6 +11793,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
mp_state->mp_state = vcpu->arch.mp_state;
out:
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
if (kvm_mpx_supported())
kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
@@ -12873,11 +12895,11 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
}
kvm_unload_vcpu_mmus(kvm);
+ kvm_destroy_vcpus(kvm);
kvm_x86_call(vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
- kvm_destroy_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -13542,15 +13564,22 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
int ret;
- irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
kvm_arch_end_assignment(irqfd->kvm);
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+
return ret;
}
@@ -13560,9 +13589,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int ret;
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
WARN_ON(irqfd->producer != prod);
- irqfd->producer = NULL;
/*
* When producer of consumer is unregistered, we change back to
@@ -13570,12 +13599,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = NULL;
+
ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+
kvm_arch_end_assignment(irqfd->kvm);
}
@@ -13588,7 +13623,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
- if (new->type != KVM_IRQ_ROUTING_MSI)
+ if (old->type != KVM_IRQ_ROUTING_MSI ||
+ new->type != KVM_IRQ_ROUTING_MSI)
return true;
return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
@@ -13727,7 +13763,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
* invalidation.
*/
if ((!pcid_enabled && (operand.pcid != 0)) ||
- is_noncanonical_address(operand.gla, vcpu)) {
+ is_noncanonical_invlpg_address(operand.gla, vcpu)) {
kvm_inject_gp(vcpu, 0);
return 1;
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a84c48ef5278..ec623d23d13d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,7 @@
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+#include "cpuid.h"
struct kvm_caps {
/* control of guest tsc rate supported? */
@@ -233,9 +234,52 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48;
}
-static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
+static inline u8 max_host_virt_addr_bits(void)
{
- return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu));
+ return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48;
+}
+
+/*
+ * x86 MSRs which contain linear addresses, x86 hidden segment bases, and
+ * IDT/GDT bases have static canonicality checks, the size of which depends
+ * only on the CPU's support for 5-level paging, rather than on the state of
+ * CR4.LA57. This applies to both WRMSR and to other instructions that set
+ * their values, e.g. SGDT.
+ *
+ * KVM passes through most of these MSRS and also doesn't intercept the
+ * instructions that set the hidden segment bases.
+ *
+ * Because of this, to be consistent with hardware, even if the guest doesn't
+ * have LA57 enabled in its CPUID, perform canonicality checks based on *host*
+ * support for 5 level paging.
+ *
+ * Finally, instructions which are related to MMU invalidation of a given
+ * linear address, also have a similar static canonical check on address.
+ * This allows for example to invalidate 5-level addresses of a guest from a
+ * host which uses 4-level paging.
+ */
+static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu,
+ unsigned int flags)
+{
+ if (flags & (X86EMUL_F_INVLPG | X86EMUL_F_MSR | X86EMUL_F_DT_LOAD))
+ return !__is_canonical_address(la, max_host_virt_addr_bits());
+ else
+ return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu));
+}
+
+static inline bool is_noncanonical_msr_address(u64 la, struct kvm_vcpu *vcpu)
+{
+ return is_noncanonical_address(la, vcpu, X86EMUL_F_MSR);
+}
+
+static inline bool is_noncanonical_base_address(u64 la, struct kvm_vcpu *vcpu)
+{
+ return is_noncanonical_address(la, vcpu, X86EMUL_F_DT_LOAD);
+}
+
+static inline bool is_noncanonical_invlpg_address(u64 la, struct kvm_vcpu *vcpu)
+{
+ return is_noncanonical_address(la, vcpu, X86EMUL_F_INVLPG);
}
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 622fe24da910..1fc2035df404 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1472,7 +1472,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
sched_poll.nr_ports * sizeof(*ports), &e)) {
*r = -EFAULT;
- return true;
+ goto out;
}
for (i = 0; i < sched_poll.nr_ports; i++) {
@@ -1916,8 +1916,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
{
struct kvm_vcpu *vcpu;
- if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
- return -EINVAL;
+ /*
+ * Don't check for the port being within range of max_evtchn_port().
+ * Userspace can configure what ever targets it likes; events just won't
+ * be delivered if/while the target is invalid, just like userspace can
+ * configure MSIs which target non-existent APICs.
+ *
+ * This allow on Live Migration and Live Update, the IRQ routing table
+ * can be restored *independently* of other things like creating vCPUs,
+ * without imposing an ordering dependency on userspace. In this
+ * particular case, the problematic ordering would be with setting the
+ * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096
+ * instead of 1024 event channels.
+ */
/* We only support 2 level event channels for now */
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fc9fb5d06174..b8f74d80f35c 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -74,6 +74,24 @@ SYM_FUNC_START(rep_movs_alternative)
_ASM_EXTABLE_UA( 0b, 1b)
.Llarge_movsq:
+ /* Do the first possibly unaligned word */
+0: movq (%rsi),%rax
+1: movq %rax,(%rdi)
+
+ _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
+
+ /* What would be the offset to the aligned destination? */
+ leaq 8(%rdi),%rax
+ andq $-8,%rax
+ subq %rdi,%rax
+
+ /* .. and update pointers and count to match */
+ addq %rax,%rdi
+ addq %rax,%rsi
+ subq %rax,%rcx
+
+ /* make %rcx contain the number of words, %rax the remainder */
movq %rcx,%rax
shrq $3,%rcx
andl $7,%eax
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 391059b2c6fb..614fb9aee2ff 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -366,6 +366,45 @@ SYM_FUNC_END(call_depth_return_thunk)
#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
+#ifdef CONFIG_MITIGATION_ITS
+
+.macro ITS_THUNK reg
+
+SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%\reg
+ int3
+ .align 32, 0xcc /* fill to the end of the line */
+ .skip 32, 0xcc /* skip to the next upper half */
+.endm
+
+/* ITS mitigation requires thunks be aligned to upper half of cacheline */
+.align 64, 0xcc
+.skip 32, 0xcc
+SYM_CODE_START(__x86_indirect_its_thunk_array)
+
+#define GEN(reg) ITS_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+ .align 64, 0xcc
+SYM_CODE_END(__x86_indirect_its_thunk_array)
+
+.align 64, 0xcc
+.skip 32, 0xcc
+SYM_CODE_START(its_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_CODE_END(its_return_thunk)
+EXPORT_SYMBOL(its_return_thunk)
+
+#endif /* CONFIG_MITIGATION_ITS */
+
/*
* This function name is magical and is used by -mfunction-return=thunk-extern
* for the compiler to generate JMPs to it.
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index caedb3ef6688..cd3fd5155f6e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -35,7 +35,7 @@
# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
#
-# REX2 Prefix
+# REX2 Prefix Superscripts
# - (!REX2): REX2 is not allowed
# - (REX2): REX2 variant e.g. JMPABS
@@ -286,10 +286,10 @@ df: ESC
# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
-e0: LOOPNE/LOOPNZ Jb (f64) (!REX2)
-e1: LOOPE/LOOPZ Jb (f64) (!REX2)
-e2: LOOP Jb (f64) (!REX2)
-e3: JrCXZ Jb (f64) (!REX2)
+e0: LOOPNE/LOOPNZ Jb (f64),(!REX2)
+e1: LOOPE/LOOPZ Jb (f64),(!REX2)
+e2: LOOP Jb (f64),(!REX2)
+e3: JrCXZ Jb (f64),(!REX2)
e4: IN AL,Ib (!REX2)
e5: IN eAX,Ib (!REX2)
e6: OUT Ib,AL (!REX2)
@@ -298,10 +298,10 @@ e7: OUT Ib,eAX (!REX2)
# in "near" jumps and calls is 16-bit. For CALL,
# push of return address is 16-bit wide, RSP is decremented by 2
# but is not truncated to 16 bits, unlike RIP.
-e8: CALL Jz (f64) (!REX2)
-e9: JMP-near Jz (f64) (!REX2)
-ea: JMP-far Ap (i64) (!REX2)
-eb: JMP-short Jb (f64) (!REX2)
+e8: CALL Jz (f64),(!REX2)
+e9: JMP-near Jz (f64),(!REX2)
+ea: JMP-far Ap (i64),(!REX2)
+eb: JMP-short Jb (f64),(!REX2)
ec: IN AL,DX (!REX2)
ed: IN eAX,DX (!REX2)
ee: OUT DX,AL (!REX2)
@@ -478,22 +478,22 @@ AVXcode: 1
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev)
# 0x0f 0x80-0x8f
# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
-80: JO Jz (f64) (!REX2)
-81: JNO Jz (f64) (!REX2)
-82: JB/JC/JNAE Jz (f64) (!REX2)
-83: JAE/JNB/JNC Jz (f64) (!REX2)
-84: JE/JZ Jz (f64) (!REX2)
-85: JNE/JNZ Jz (f64) (!REX2)
-86: JBE/JNA Jz (f64) (!REX2)
-87: JA/JNBE Jz (f64) (!REX2)
-88: JS Jz (f64) (!REX2)
-89: JNS Jz (f64) (!REX2)
-8a: JP/JPE Jz (f64) (!REX2)
-8b: JNP/JPO Jz (f64) (!REX2)
-8c: JL/JNGE Jz (f64) (!REX2)
-8d: JNL/JGE Jz (f64) (!REX2)
-8e: JLE/JNG Jz (f64) (!REX2)
-8f: JNLE/JG Jz (f64) (!REX2)
+80: JO Jz (f64),(!REX2)
+81: JNO Jz (f64),(!REX2)
+82: JB/JC/JNAE Jz (f64),(!REX2)
+83: JAE/JNB/JNC Jz (f64),(!REX2)
+84: JE/JZ Jz (f64),(!REX2)
+85: JNE/JNZ Jz (f64),(!REX2)
+86: JBE/JNA Jz (f64),(!REX2)
+87: JA/JNBE Jz (f64),(!REX2)
+88: JS Jz (f64),(!REX2)
+89: JNS Jz (f64),(!REX2)
+8a: JP/JPE Jz (f64),(!REX2)
+8b: JNP/JPO Jz (f64),(!REX2)
+8c: JL/JNGE Jz (f64),(!REX2)
+8d: JNL/JGE Jz (f64),(!REX2)
+8e: JLE/JNG Jz (f64),(!REX2)
+8f: JNLE/JG Jz (f64),(!REX2)
# 0x0f 0x90-0x9f
90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66)
91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66)
@@ -996,8 +996,8 @@ AVXcode: 4
83: Grp1 Ev,Ib (1A),(es)
# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL,
# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ
-84: CTESTSCC (ev)
-85: CTESTSCC (es) | CTESTSCC (66),(es)
+84: CTESTSCC Eb,Gb (ev)
+85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es)
88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es)
8f: POP2 Bq,Rq (000),(11B),(ev)
a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index eb503f53c319..9cbc1e6057d3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -263,28 +263,33 @@ static void __init probe_page_size_mask(void)
}
/*
- * INVLPG may not properly flush Global entries
- * on these CPUs when PCIDs are enabled.
+ * INVLPG may not properly flush Global entries on
+ * these CPUs. New microcode fixes the issue.
*/
static const struct x86_cpu_id invlpg_miss_ids[] = {
- X86_MATCH_VFM(INTEL_ALDERLAKE, 0),
- X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0),
- X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0),
- X86_MATCH_VFM(INTEL_RAPTORLAKE, 0),
- X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0),
- X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e),
{}
};
static void setup_pcid(void)
{
+ const struct x86_cpu_id *invlpg_miss_match;
+
if (!IS_ENABLED(CONFIG_X86_64))
return;
if (!boot_cpu_has(X86_FEATURE_PCID))
return;
- if (x86_match_cpu(invlpg_miss_ids)) {
+ invlpg_miss_match = x86_match_cpu(invlpg_miss_ids);
+
+ if (invlpg_miss_match &&
+ boot_cpu_data.microcode < invlpg_miss_match->driver_data) {
pr_info("Incomplete global flushes, disabling PCID");
setup_clear_cpu_cap(X86_FEATURE_PCID);
return;
@@ -640,8 +645,13 @@ static void __init memory_map_top_down(unsigned long map_start,
*/
addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
map_end);
- memblock_phys_free(addr, PMD_SIZE);
- real_end = addr + PMD_SIZE;
+ if (!addr) {
+ pr_warn("Failed to release memory for alloc_low_pages()");
+ real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE));
+ } else {
+ memblock_phys_free(addr, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+ }
/* step_size need to be small so pgt_buf from BRK could cover it */
step_size = PMD_SIZE;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ff253648706f..d8853afd314b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -967,9 +967,18 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
- /* update max_pfn, max_low_pfn and high_memory */
- update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
- nr_pages << PAGE_SHIFT);
+ /*
+ * Special case: add_pages() is called by memremap_pages() for adding device
+ * private pages. Do not bump up max_pfn in the device private path,
+ * because max_pfn changes affect dma_addressing_limited().
+ *
+ * dma_addressing_limited() returning true when max_pfn is the device's
+ * addressable memory can force device drivers to use bounce buffers
+ * and impact their performance negatively:
+ */
+ if (!params->pgmap)
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
return ret;
}
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 230f1dee4f09..e0b0ec0f8245 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -109,8 +109,14 @@ void __init kernel_randomize_memory(void)
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
- /* Adapt physical memory region size based on available memory */
- if (memory_tb < kaslr_regions[0].size_tb)
+ /*
+ * Adapt physical memory region size based on available memory,
+ * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the
+ * device BAR space assuming the direct map space is large enough
+ * for creating a ZONE_DEVICE mapping in the direct map corresponding
+ * to the physical BAR address.
+ */
+ if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb))
kaslr_regions[0].size_tb = memory_tb;
/*
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index ac33b2263a43..b922b9fea6b6 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -562,7 +562,7 @@ void __head sme_enable(struct boot_params *bp)
}
RIP_REL_REF(sme_me_mask) = me_mask;
- physical_mask &= ~me_mask;
- cc_vendor = CC_VENDOR_AMD;
+ RIP_REL_REF(physical_mask) &= ~me_mask;
+ RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD;
cc_set_mask(me_mask);
}
diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c
index 3d2f7f0a6ed1..ad3c1feec990 100644
--- a/arch/x86/mm/pat/cpa-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
@@ -183,7 +183,7 @@ static int pageattr_test(void)
break;
case 1:
- err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1);
+ err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1);
break;
case 2:
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index feb8cc6a12bf..d721cc19addb 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -984,29 +984,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
return -EINVAL;
}
-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, unsigned long *pfn)
{
+ const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
resource_size_t paddr;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
+ int rc;
- if (vma->vm_flags & VM_PAT) {
- if (get_pat_info(vma, &paddr, &pgprot))
- return -EINVAL;
- /* reserve the whole chunk covered by vma. */
- return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
- }
+ if (!(src_vma->vm_flags & VM_PAT))
+ return 0;
+
+ /*
+ * Duplicate the PAT information for the dst VMA based on the src
+ * VMA.
+ */
+ if (get_pat_info(src_vma, &paddr, &pgprot))
+ return -EINVAL;
+ rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+ if (rc)
+ return rc;
+ /* Reservation for the destination VMA succeeded. */
+ vm_flags_set(dst_vma, VM_PAT);
+ *pfn = PHYS_PFN(paddr);
return 0;
}
+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+ untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+ /*
+ * Reservation was freed, any copied page tables will get cleaned
+ * up later, but without getting PAT involved again.
+ */
+}
+
/*
* prot is passed in as a parameter for the new mapping. If the vma has
* a linear pfn mapping for the entire range, or no vma is provided,
@@ -1095,15 +1108,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
}
}
-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- * its pfnmap page table has been removed. The new vma has a new pfnmap
- * to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- * old vma.
- */
void untrack_pfn_clear(struct vm_area_struct *vma)
{
vm_flags_clear(vma, VM_PAT);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 44f7b2ea6a07..69ceb967d73e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2422,7 +2422,7 @@ static int __set_pages_np(struct page *page, int numpages)
.pgd = NULL,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
.flags = CPA_NO_CHECK_ALIAS };
/*
@@ -2501,7 +2501,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
+ .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
.flags = CPA_NO_CHECK_ALIAS,
};
@@ -2544,7 +2544,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
.flags = CPA_NO_CHECK_ALIAS,
};
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 00ffa74d0dd0..8629d90fdcd9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -389,9 +389,9 @@ static void cond_mitigation(struct task_struct *next)
prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
/*
- * Avoid user/user BTB poisoning by flushing the branch predictor
- * when switching between processes. This stops one process from
- * doing Spectre-v2 attacks on another.
+ * Avoid user->user BTB/RSB poisoning by flushing them when switching
+ * between processes. This stops one process from doing Spectre-v2
+ * attacks on another.
*
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
@@ -624,7 +624,11 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
+ /*
+ * Indicate that CR3 is about to change. nmi_uaccess_okay()
+ * and others are sensitive to the window where mm_cpumask(),
+ * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
+ */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
}
@@ -895,8 +899,16 @@ done:
static bool should_flush_tlb(int cpu, void *data)
{
+ struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
struct flush_tlb_info *info = data;
+ /*
+ * Order the 'loaded_mm' and 'is_lazy' against their
+ * write ordering in switch_mm_irqs_off(). Ensure
+ * 'is_lazy' is at least as new as 'loaded_mm'.
+ */
+ smp_rmb();
+
/* Lazy TLB will get flushed at the next context switch. */
if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
return false;
@@ -905,8 +917,15 @@ static bool should_flush_tlb(int cpu, void *data)
if (!info->mm)
return true;
+ /*
+ * While switching, the remote CPU could have state from
+ * either the prev or next mm. Assume the worst and flush.
+ */
+ if (loaded_mm == LOADED_MM_SWITCHING)
+ return true;
+
/* The target mm is loaded, and the CPU is not lazy. */
- if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ if (loaded_mm == info->mm)
return true;
/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 06b080b61aa5..ccb2f7703c33 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -41,6 +41,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+#define EMIT5(b1, b2, b3, b4, b5) \
+ do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0)
#define EMIT1_off32(b1, off) \
do { EMIT1(b1); EMIT(off, 4); } while (0)
@@ -637,7 +639,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
{
u8 *prog = *pprog;
- if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) {
+ OPTIMIZER_HIDE_VAR(reg);
+ emit_jump(&prog, its_static_thunk(reg), ip);
+ } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
EMIT_LFENCE();
EMIT2(0xFF, 0xE0 + reg);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
@@ -659,7 +664,7 @@ static void emit_return(u8 **pprog, u8 *ip)
{
u8 *prog = *pprog;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+ if (cpu_wants_rethunk()) {
emit_jump(&prog, x86_return_thunk, ip);
} else {
EMIT1(0xC3); /* ret */
@@ -1412,6 +1417,48 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
#define LOAD_TAIL_CALL_CNT_PTR(stack) \
__LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
+static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip,
+ struct bpf_prog *bpf_prog)
+{
+ u8 *prog = *pprog;
+ u8 *func;
+
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) {
+ /* The clearing sequence clobbers eax and ecx. */
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x51); /* push rcx */
+ ip += 2;
+
+ func = (u8 *)clear_bhb_loop;
+ ip += x86_call_depth_emit_accounting(&prog, func, ip);
+
+ if (emit_call(&prog, func, ip))
+ return -EINVAL;
+ EMIT1(0x59); /* pop rcx */
+ EMIT1(0x58); /* pop rax */
+ }
+ /* Insert IBHF instruction */
+ if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) &&
+ cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) ||
+ cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) {
+ /*
+ * Add an Indirect Branch History Fence (IBHF). IBHF acts as a
+ * fence preventing branch history from before the fence from
+ * affecting indirect branches after the fence. This is
+ * specifically used in cBPF jitted code to prevent Intra-mode
+ * BHI attacks. The IBHF instruction is designed to be a NOP on
+ * hardware that doesn't need or support it. The REP and REX.W
+ * prefixes are required by the microcode, and they also ensure
+ * that the NOP is unlikely to be used in existing code.
+ *
+ * IBHF is not a valid instruction in 32-bit mode.
+ */
+ EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */
+ }
+ *pprog = prog;
+ return 0;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
@@ -2402,6 +2449,13 @@ emit_jmp:
seen_exit = true;
/* Update cleanup_addr */
ctx->cleanup_addr = proglen;
+ if (bpf_prog_was_classic(bpf_prog) &&
+ !capable(CAP_SYS_ADMIN)) {
+ u8 *ip = image + addrs[i - 1];
+
+ if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog))
+ return -EINVAL;
+ }
if (bpf_prog->aux->exception_boundary) {
pop_callee_regs(&prog, all_callee_regs_used);
pop_r12(&prog);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 0f2fe524f60d..b8755cde2419 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -436,7 +436,8 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
};
static struct msi_domain_info xen_pci_msi_domain_info = {
- .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
+ .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS |
+ MSI_FLAG_DEV_SYSFS | MSI_FLAG_NO_MASK,
.ops = &xen_pci_msi_domain_ops,
};
@@ -484,11 +485,6 @@ static __init void xen_setup_pci_msi(void)
* in allocating the native domain and never use it.
*/
x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
- /*
- * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
- * controlled by the hypervisor.
- */
- pci_msi_ignore_mask = 1;
}
#else /* CONFIG_PCI_MSI */
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 63230ff8cf4f..08e76a5ca155 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -27,6 +27,7 @@
#include <asm/mmu_context.h>
#include <asm/cpu_device_id.h>
#include <asm/microcode.h>
+#include <asm/fred.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -231,6 +232,19 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
*/
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+
+ /*
+ * Reinitialize FRED to ensure the FRED MSRs contain the same values
+ * as before hibernation.
+ *
+ * Note, the setup of FRED RSPs requires access to percpu data
+ * structures. Therefore, FRED reinitialization can only occur after
+ * the percpu access pointer (i.e., MSR_GS_BASE) is restored.
+ */
+ if (ctxt->cr4 & X86_CR4_FRED) {
+ cpu_init_fred_exceptions();
+ cpu_init_fred_rsps();
+ }
#else
loadsegment(fs, __KERNEL_PERCPU);
#endif
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 472540aeabc2..08cd913cbd4e 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -10,8 +10,7 @@
#include <assert.h>
#include <unistd.h>
#include <stdarg.h>
-
-#define unlikely(cond) (cond)
+#include <linux/kallsyms.h>
#include <asm/insn.h>
#include <inat.c>
@@ -106,7 +105,7 @@ static void parse_args(int argc, char **argv)
}
}
-#define BUFSIZE 256
+#define BUFSIZE (256 + KSYM_NAME_LEN)
int main(int argc, char **argv)
{
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index b07824500363..ddc144657efa 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -20,6 +20,9 @@
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
+/* Do not call this directly. Declared for export type visibility. */
+extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
+
/**
* csum_fold - Fold and invert a 32bit checksum.
* sum: 32bit unfolded sum
diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c
index e80ab7d28117..1b0d95328b2c 100644
--- a/arch/x86/um/os-Linux/mcontext.c
+++ b/arch/x86/um/os-Linux/mcontext.c
@@ -27,7 +27,6 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
COPY(RIP);
COPY2(EFLAGS, EFL);
COPY2(CS, CSGSFS);
- regs->gp[CS / sizeof(unsigned long)] &= 0xffff;
- regs->gp[CS / sizeof(unsigned long)] |= 3;
+ regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48;
#endif
}
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 4e2b2e2ac9f9..eb91bc5448de 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -69,8 +69,9 @@ static inline void seamcall_err_ret(u64 fn, u64 err,
args->r9, args->r10, args->r11);
}
-static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
- u64 fn, struct tdx_module_args *args)
+static __always_inline int sc_retry_prerr(sc_func_t func,
+ sc_err_func_t err_func,
+ u64 fn, struct tdx_module_args *args)
{
u64 sret = sc_retry(func, fn, args);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b4f3784f27e9..0c950bbca309 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -70,6 +70,9 @@ EXPORT_SYMBOL(xen_start_flags);
*/
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
+/* Number of pages released from the initial allocation. */
+unsigned long xen_released_pages;
+
static __ref void xen_get_vendor(void)
{
init_cpu_devs();
@@ -465,6 +468,13 @@ int __init arch_xen_unpopulated_init(struct resource **res)
xen_free_unpopulated_pages(1, &pg);
}
+ /*
+ * Account for the region being in the physmap but unpopulated.
+ * The value in xen_released_pages is used by the balloon
+ * driver to know how much of the physmap is unpopulated and
+ * set an accurate initial memory target.
+ */
+ xen_released_pages += xen_extra_mem[i].n_pfns;
/* Zero so region is not also added to the balloon driver. */
xen_extra_mem[i].n_pfns = 0;
}
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 0e3d930bcb89..9d25d9373945 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/acpi.h>
+#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
#include <linux/export.h>
#include <linux/mm.h>
@@ -123,8 +125,23 @@ static void __init pvh_arch_setup(void)
{
pvh_reserve_extra_memory();
- if (xen_initial_domain())
+ if (xen_initial_domain()) {
xen_add_preferred_consoles();
+
+ /*
+ * Disable usage of CPU idle and frequency drivers: when
+ * running as hardware domain the exposed native ACPI tables
+ * causes idle and/or frequency drivers to attach and
+ * malfunction. It's Xen the entity that controls the idle and
+ * frequency states.
+ *
+ * For unprivileged domains the exposed ACPI tables are
+ * fabricated and don't contain such data.
+ */
+ disable_cpuidle();
+ disable_cpufreq();
+ WARN_ON(xen_set_default_idle());
+ }
}
void __init xen_pvh_init(struct boot_params *boot_params)
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 10c660fae8b3..7237d56a9d3f 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -54,14 +54,20 @@ struct mc_debug_data {
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
static struct mc_debug_data mc_debug_data_early __initdata;
-static DEFINE_PER_CPU(struct mc_debug_data *, mc_debug_data) =
- &mc_debug_data_early;
static struct mc_debug_data __percpu *mc_debug_data_ptr;
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
static struct static_key mc_debug __ro_after_init;
static bool mc_debug_enabled __initdata;
+static struct mc_debug_data * __ref get_mc_debug(void)
+{
+ if (!mc_debug_data_ptr)
+ return &mc_debug_data_early;
+
+ return this_cpu_ptr(mc_debug_data_ptr);
+}
+
static int __init xen_parse_mc_debug(char *arg)
{
mc_debug_enabled = true;
@@ -71,20 +77,16 @@ static int __init xen_parse_mc_debug(char *arg)
}
early_param("xen_mc_debug", xen_parse_mc_debug);
-void mc_percpu_init(unsigned int cpu)
-{
- per_cpu(mc_debug_data, cpu) = per_cpu_ptr(mc_debug_data_ptr, cpu);
-}
-
static int __init mc_debug_enable(void)
{
unsigned long flags;
+ struct mc_debug_data __percpu *mcdb;
if (!mc_debug_enabled)
return 0;
- mc_debug_data_ptr = alloc_percpu(struct mc_debug_data);
- if (!mc_debug_data_ptr) {
+ mcdb = alloc_percpu(struct mc_debug_data);
+ if (!mcdb) {
pr_err("xen_mc_debug inactive\n");
static_key_slow_dec(&mc_debug);
return -ENOMEM;
@@ -93,7 +95,7 @@ static int __init mc_debug_enable(void)
/* Be careful when switching to percpu debug data. */
local_irq_save(flags);
xen_mc_flush();
- mc_percpu_init(0);
+ mc_debug_data_ptr = mcdb;
local_irq_restore(flags);
pr_info("xen_mc_debug active\n");
@@ -155,7 +157,7 @@ void xen_mc_flush(void)
trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
if (static_key_false(&mc_debug)) {
- mcdb = __this_cpu_read(mc_debug_data);
+ mcdb = get_mc_debug();
memcpy(mcdb->entries, b->entries,
b->mcidx * sizeof(struct multicall_entry));
}
@@ -235,7 +237,7 @@ struct multicall_space __xen_mc_entry(size_t args)
ret.mc = &b->entries[b->mcidx];
if (static_key_false(&mc_debug)) {
- struct mc_debug_data *mcdb = __this_cpu_read(mc_debug_data);
+ struct mc_debug_data *mcdb = get_mc_debug();
mcdb->caller[b->mcidx] = __builtin_return_address(0);
mcdb->argsz[b->mcidx] = args;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index c3db71d96c43..3823e52aef52 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -37,9 +37,6 @@
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
-/* Number of pages released from the initial allocation. */
-unsigned long xen_released_pages;
-
/* Memory map would allow PCI passthrough. */
bool xen_pv_pci_possible;
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 6863d3da7dec..7ea57f728b89 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -305,7 +305,6 @@ static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle)
return rc;
xen_pmu_init(cpu);
- mc_percpu_init(cpu);
/*
* Why is this a BUG? If the hypercall fails then everything can be
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 63c13a2ccf55..25e318ef27d6 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -261,9 +261,6 @@ void xen_mc_callback(void (*fn)(void *), void *data);
*/
struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
-/* Do percpu data initialization for multicalls. */
-void mc_percpu_init(unsigned int cpu);
-
extern bool is_xen_pmu;
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);