summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig31
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/sev.c7
-rw-r--r--arch/x86/boot/cpuflags.c13
-rw-r--r--arch/x86/boot/genimage.sh5
-rw-r--r--arch/x86/entry/entry.S10
-rw-r--r--arch/x86/entry/entry_64.S20
-rw-r--r--arch/x86/events/amd/ibs.c3
-rw-r--r--arch/x86/events/core.c2
-rw-r--r--arch/x86/events/intel/core.c4
-rw-r--r--arch/x86/hyperv/irqdomain.c4
-rw-r--r--arch/x86/include/asm/alternative.h32
-rw-r--r--arch/x86/include/asm/cpu.h12
-rw-r--r--arch/x86/include/asm/cpufeatures.h12
-rw-r--r--arch/x86/include/asm/entry-common.h7
-rw-r--r--arch/x86/include/asm/irqflags.h4
-rw-r--r--arch/x86/include/asm/kexec.h18
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h23
-rw-r--r--arch/x86/include/asm/msr-index.h9
-rw-r--r--arch/x86/include/asm/mwait.h19
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/nospec-branch.h79
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h3
-rw-r--r--arch/x86/include/asm/reboot.h5
-rw-r--r--arch/x86/include/asm/virtext.h10
-rw-r--r--arch/x86/include/asm/xen/hypercall.h5
-rw-r--r--arch/x86/kernel/alternative.c248
-rw-r--r--arch/x86/kernel/cpu/amd.c61
-rw-r--r--arch/x86/kernel/cpu/bugs.c497
-rw-r--r--arch/x86/kernel/cpu/common.c163
-rw-r--r--arch/x86/kernel/cpu/hygon.c3
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c28
-rw-r--r--arch/x86/kernel/cpu/mce/core.c8
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/scattered.c3
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c2
-rw-r--r--arch/x86/kernel/ftrace.c4
-rw-r--r--arch/x86/kernel/i8253.c3
-rw-r--r--arch/x86/kernel/ioport.c13
-rw-r--r--arch/x86/kernel/kprobes/core.c1
-rw-r--r--arch/x86/kernel/machine_kexec_64.c45
-rw-r--r--arch/x86/kernel/module.c16
-rw-r--r--arch/x86/kernel/nmi.c42
-rw-r--r--arch/x86/kernel/process.c21
-rw-r--r--arch/x86/kernel/reboot.c53
-rw-r--r--arch/x86/kernel/sev-shared.c18
-rw-r--r--arch/x86/kernel/sev.c11
-rw-r--r--arch/x86/kernel/static_call.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S10
-rw-r--r--arch/x86/kvm/cpuid.c9
-rw-r--r--arch/x86/kvm/hyperv.c10
-rw-r--r--arch/x86/kvm/lapic.c63
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/reverse_cpuid.h8
-rw-r--r--arch/x86/kvm/svm/avic.c60
-rw-r--r--arch/x86/kvm/svm/sev.c4
-rw-r--r--arch/x86/kvm/svm/svm.c60
-rw-r--r--arch/x86/kvm/svm/vmenter.S15
-rw-r--r--arch/x86/kvm/trace.h9
-rw-r--r--arch/x86/kvm/vmx/nested.c26
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c28
-rw-r--r--arch/x86/kvm/vmx/vmx.c182
-rw-r--r--arch/x86/kvm/vmx/vmx.h31
-rw-r--r--arch/x86/kvm/x86.c89
-rw-r--r--arch/x86/kvm/x86.h12
-rw-r--r--arch/x86/kvm/xen.c15
-rw-r--r--arch/x86/lib/retpoline.S39
-rw-r--r--arch/x86/mm/extable.c5
-rw-r--r--arch/x86/mm/init.c9
-rw-r--r--arch/x86/mm/init_64.c33
-rw-r--r--arch/x86/mm/kaslr.c10
-rw-r--r--arch/x86/mm/tlb.c29
-rw-r--r--arch/x86/net/bpf_jit_comp.c60
-rw-r--r--arch/x86/tools/insn_decoder_test.c5
-rw-r--r--arch/x86/um/asm/checksum.h3
-rw-r--r--arch/x86/um/os-Linux/mcontext.c3
80 files changed, 1904 insertions, 520 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48ab6e8f2d1d..c7dfe2a5f162 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -124,7 +124,7 @@ config X86
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_GENERAL_HUGETLB
- select ARCH_WANT_HUGE_PMD_SHARE
+ select ARCH_WANT_HUGE_PMD_SHARE if X86_64
select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_THP_SWAP if X86_64
@@ -2575,6 +2575,35 @@ config MITIGATION_SPECTRE_BHI
indirect branches.
See <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+config MITIGATION_ITS
+ bool "Enable Indirect Target Selection mitigation"
+ depends on CPU_SUP_INTEL && X86_64
+ depends on RETPOLINE && RETHUNK
+ default y
+ help
+ Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in
+ BPU on some Intel CPUs that may allow Spectre V2 style attacks. If
+ disabled, mitigation cannot be enabled via cmdline.
+ See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst>
+
+config MITIGATION_TSA
+ bool "Mitigate Transient Scheduler Attacks"
+ depends on CPU_SUP_AMD
+ default y
+ help
+ Enable mitigation for Transient Scheduler Attacks. TSA is a hardware
+ security vulnerability on AMD CPUs which can lead to forwarding of
+ invalid info to subsequent instructions and thus can affect their
+ timing and thereby cause a leakage.
+
+config MITIGATION_VMSCAPE
+ bool "Mitigate VMSCAPE"
+ depends on KVM
+ default y
+ help
+ Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security
+ vulnerability on Intel and AMD CPUs that may allow a guest to do
+ Spectre v2 style attacks on userspace hypervisor.
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3419ffa2a350..a4764ec92373 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -43,7 +43,7 @@ endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
-REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
+REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 3c5d5c97f8f7..4f61d48f2575 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -164,6 +164,13 @@ static void __page_state_change(unsigned long paddr, enum psc_op op)
*/
if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+
+ /*
+ * If validating memory (making it private) and affected by the
+ * cache-coherency vulnerability, perform the cache eviction mitigation.
+ */
+ if (op == SNP_PAGE_STATE_PRIVATE && !has_cpuflag(X86_FEATURE_COHERENCY_SFW_NO))
+ sev_evict_cache((void *)paddr, 1);
}
void snp_set_page_private(unsigned long paddr)
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index a83d67ec627d..aa4943432dcf 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -124,5 +124,18 @@ void get_cpuflags(void)
cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6],
&cpu.flags[1]);
}
+
+ if (max_amd_level >= 0x8000001f) {
+ u32 ebx;
+
+ /*
+ * The X86_FEATURE_COHERENCY_SFW_NO feature bit is in
+ * the virtualization flags entry (word 8) and set by
+ * scattered.c, so the bit needs to be explicitly set.
+ */
+ cpuid(0x8000001f, &ignored, &ebx, &ignored, &ignored);
+ if (ebx & BIT(31))
+ set_bit(X86_FEATURE_COHERENCY_SFW_NO, cpu.flags);
+ }
}
}
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index c9299aeb7333..3882ead513f7 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -22,6 +22,7 @@
# This script requires:
# bash
# syslinux
+# genisoimage
# mtools (for fdimage* and hdimage)
# edk2/OVMF (for hdimage)
#
@@ -251,7 +252,9 @@ geniso() {
cp "$isolinux" "$ldlinux" "$tmp_dir"
cp "$FBZIMAGE" "$tmp_dir"/linux
echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg
- cp "${FDINITRDS[@]}" "$tmp_dir"/
+ if [ ${#FDINITRDS[@]} -gt 0 ]; then
+ cp "${FDINITRDS[@]}" "$tmp_dir"/
+ fi
genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \
-quiet -o "$FIMAGE" -b isolinux.bin \
-c boot.cat -no-emul-boot -boot-load-size 4 \
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index f4419afc7147..057eeb4eda4e 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -16,7 +16,7 @@
SYM_FUNC_START(entry_ibpb)
movl $MSR_IA32_PRED_CMD, %ecx
- movl $PRED_CMD_IBPB, %eax
+ movl _ASM_RIP(x86_pred_cmd), %eax
xorl %edx, %edx
wrmsr
@@ -31,20 +31,20 @@ EXPORT_SYMBOL_GPL(entry_ibpb);
/*
* Define the VERW operand that is disguised as entry code so that
- * it can be referenced with KPTI enabled. This ensure VERW can be
+ * it can be referenced with KPTI enabled. This ensures VERW can be
* used late in exit-to-user path after page tables are switched.
*/
.pushsection .entry.text, "ax"
.align L1_CACHE_BYTES, 0xcc
-SYM_CODE_START_NOALIGN(mds_verw_sel)
+SYM_CODE_START_NOALIGN(x86_verw_sel)
UNWIND_HINT_EMPTY
ANNOTATE_NOENDBR
.word __KERNEL_DS
.align L1_CACHE_BYTES, 0xcc
-SYM_CODE_END(mds_verw_sel);
+SYM_CODE_END(x86_verw_sel);
/* For KVM */
-EXPORT_SYMBOL_GPL(mds_verw_sel);
+EXPORT_SYMBOL_GPL(x86_verw_sel);
.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a114338380a6..4f5968810450 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1559,7 +1559,9 @@ SYM_CODE_END(rewind_stack_and_make_dead)
* ORC to unwind properly.
*
* The alignment is for performance and not for safety, and may be safely
- * refactored in the future if needed.
+ * refactored in the future if needed. The .skips are for safety, to ensure
+ * that all RETs are in the second half of a cacheline to mitigate Indirect
+ * Target Selection, rather than taking the slowpath via its_return_thunk.
*/
SYM_FUNC_START(clear_bhb_loop)
push %rbp
@@ -1569,10 +1571,22 @@ SYM_FUNC_START(clear_bhb_loop)
call 1f
jmp 5f
.align 64, 0xcc
+ /*
+ * Shift instructions so that the RET is in the upper half of the
+ * cacheline and don't take the slowpath to its_return_thunk.
+ */
+ .skip 32 - (.Lret1 - 1f), 0xcc
ANNOTATE_INTRA_FUNCTION_CALL
1: call 2f
- RET
+.Lret1: RET
.align 64, 0xcc
+ /*
+ * As above shift instructions for RET at .Lret2 as well.
+ *
+ * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc
+ * but some Clang versions (e.g. 18) don't like this.
+ */
+ .skip 32 - 18, 0xcc
2: movl $5, %eax
3: jmp 4f
nop
@@ -1580,7 +1594,7 @@ SYM_FUNC_START(clear_bhb_loop)
jnz 3b
sub $1, %ecx
jnz 1b
- RET
+.Lret2: RET
5: lfence
pop %rbp
RET
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 37cbbc5c659a..8c385e231e07 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -1216,7 +1216,8 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_OPCNTEXT) {
perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK;
perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK;
- perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK;
+ perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK |
+ IBS_OP_CUR_CNT_EXT_MASK);
}
if (ibs_caps & IBS_CAPS_ZEN4)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index fa0744732444..a8732dd9fedb 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -623,7 +623,7 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.type == event->pmu->type)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
- if (!event->attr.freq && x86_pmu.limit_period) {
+ if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) {
s64 left = event->attr.sample_period;
x86_pmu.limit_period(event, &left);
if (left > event->attr.sample_period)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 46677cad3a39..2cb5b1f715b6 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2703,7 +2703,7 @@ static void intel_pmu_read_event(struct perf_event *event)
if (pmu_enabled)
intel_pmu_disable_all();
- if (is_topdown_event(event))
+ if (is_topdown_count(event))
static_call(intel_pmu_update_topdown_event)(event);
else
intel_pmu_drain_pebs_buffer();
@@ -4177,7 +4177,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
arr[pebs_enable] = (struct perf_guest_switch_msr){
.msr = MSR_IA32_PEBS_ENABLE,
.host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
- .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
+ .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask & kvm_pmu->pebs_enable,
};
if (arr[pebs_enable].host) {
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 42c70d28ef27..865ae4be233b 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -192,7 +192,6 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
struct pci_dev *dev;
struct hv_interrupt_entry out_entry, *stored_entry;
struct irq_cfg *cfg = irqd_cfg(data);
- const cpumask_t *affinity;
int cpu;
u64 status;
@@ -204,8 +203,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
- affinity = irq_data_get_effective_affinity_mask(data);
- cpu = cpumask_first_and(affinity, cpu_online_mask);
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
if (data->chip_data) {
/*
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 9542c582d546..fd3b73017738 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/stringify.h>
#include <asm/asm.h>
+#include <asm/bug.h>
#define ALTINSTR_FLAG_INV (1 << 15)
#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV)
@@ -81,6 +82,37 @@ extern void apply_ibt_endbr(s32 *start, s32 *end);
struct module;
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_init_mod(struct module *mod);
+extern void its_fini_mod(struct module *mod);
+extern void its_free_mod(struct module *mod);
+extern u8 *its_static_thunk(int reg);
+#else /* CONFIG_MITIGATION_ITS */
+static inline void its_init_mod(struct module *mod) { }
+static inline void its_fini_mod(struct module *mod) { }
+static inline void its_free_mod(struct module *mod) { }
+static inline u8 *its_static_thunk(int reg)
+{
+ WARN_ONCE(1, "ITS not compiled in");
+
+ return NULL;
+}
+#endif
+
+#if defined(CONFIG_RETHUNK) && defined(CONFIG_OBJTOOL)
+extern bool cpu_wants_rethunk(void);
+extern bool cpu_wants_rethunk_at(void *addr);
+#else
+static __always_inline bool cpu_wants_rethunk(void)
+{
+ return false;
+}
+static __always_inline bool cpu_wants_rethunk_at(void *addr)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SMP
extern void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 37639a2d9c34..c976bbf909e0 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -98,4 +98,16 @@ extern u64 x86_read_arch_cap_msr(void);
extern struct cpumask cpus_stop_mask;
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5709cbba28e7..f86e100cf56b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -230,6 +230,7 @@
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_COHERENCY_SFW_NO ( 8*32+ 5) /* "" SNP cache coherency software work around not needed */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
@@ -429,6 +430,7 @@
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* "" The memory form of VERW mitigates TSA */
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */
@@ -445,6 +447,12 @@
#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32 + 5) /* "" Use thunk for indirect branches in lower half of cacheline */
+
+#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* "" AMD CPU not vulnerable to TSA-SQ */
+#define X86_FEATURE_TSA_L1_NO (21*32+12) /* "" AMD CPU not vulnerable to TSA-L1 */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* "" Clear CPU buffers using VERW before VMRUN */
+#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */
/*
* BUG word(s)
@@ -495,4 +503,8 @@
#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */
#define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */
+#define X86_BUG_ITS X86_BUG(1*32 + 5) /* CPU is affected by Indirect Target Selection */
+#define X86_BUG_ITS_NATIVE_ONLY X86_BUG(1*32 + 6) /* CPU is affected by ITS, VMX is not affected */
+#define X86_BUG_TSA X86_BUG(1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
+#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index ebdf5c97f53a..7dedda82f499 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -83,6 +83,13 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
* 8 (ia32) bits.
*/
choose_random_kstack_offset(rdtsc());
+
+ /* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) &&
+ this_cpu_read(x86_ibpb_exit_to_user)) {
+ indirect_branch_prediction_barrier();
+ this_cpu_write(x86_ibpb_exit_to_user, false);
+ }
}
#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 7793e52d6237..b8ec2be2bbe4 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -47,13 +47,13 @@ static __always_inline void native_irq_enable(void)
static inline __cpuidle void native_safe_halt(void)
{
- mds_idle_clear_cpu_buffers();
+ x86_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
static inline __cpuidle void native_halt(void)
{
- mds_idle_clear_cpu_buffers();
+ x86_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
}
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index e2e1ec99c999..256eee99afc8 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -16,7 +16,6 @@
# define PAGES_NR 4
#endif
-# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
#ifndef __ASSEMBLY__
@@ -45,6 +44,7 @@ struct kimage;
/* Maximum address we can use for the control code buffer */
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+# define KEXEC_CONTROL_PAGE_SIZE 4096
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_386
@@ -59,6 +59,9 @@ struct kimage;
/* Maximum address we can use for the control pages */
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
+/* Allocate one page for the pdp and the second for the code */
+# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
+
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
@@ -143,19 +146,6 @@ struct kimage_arch {
};
#else
struct kimage_arch {
- /*
- * This is a kimage control page, as it must not overlap with either
- * source or destination address ranges.
- */
- pgd_t *pgd;
- /*
- * The virtual mapping of the control code page itself is used only
- * during the transition, while the current kernel's pages are all
- * in place. Thus the intermediate page table pages used to map it
- * are not control pages, but instead just normal pages obtained
- * with get_zeroed_page(). And have to be tracked (below) so that
- * they can be freed.
- */
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index abc07d004589..c068565fe954 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -99,7 +99,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9de3db4a32f8..d0229323ca63 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -677,6 +677,7 @@ struct kvm_vcpu_arch {
u32 pkru;
u32 hflags;
u64 efer;
+ u64 host_debugctl;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
bool load_eoi_exitmap_pending;
@@ -1455,6 +1456,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
}
+enum kvm_x86_run_flags {
+ KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0),
+ KVM_RUN_LOAD_GUEST_DR6 = BIT(1),
+ KVM_RUN_LOAD_DEBUGCTL = BIT(2),
+};
+
struct kvm_x86_ops {
const char *name;
@@ -1478,6 +1485,12 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+ /*
+ * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to
+ * match the host's value even while the guest is active.
+ */
+ const u64 HOST_OWNED_DEBUGCTL;
+
void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -1526,7 +1539,8 @@ struct kvm_x86_ops {
void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+ u64 run_flags);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1546,10 +1560,12 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+
+ const bool x2apic_icr_is_split;
bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
- void (*hwapic_isr_update)(int isr);
+ void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
@@ -1585,8 +1601,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
-
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
/*
@@ -2054,7 +2068,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
u32 size);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5ffcfcc41282..afd65c815043 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -185,6 +185,14 @@
* VERW clears CPU Register
* File.
*/
+#define ARCH_CAP_ITS_NO BIT_ULL(62) /*
+ * Not susceptible to
+ * Indirect Target Selection.
+ * This bit is not set by
+ * HW, but is synthesized by
+ * VMMs for guests to know
+ * their affected status.
+ */
#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
* IA32_XAPIC_DISABLE_STATUS MSR
@@ -371,6 +379,7 @@
#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
#define MSR_PEBS_FRONTEND 0x000003f7
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 3a8fdf881313..d938792c11d9 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -44,8 +44,6 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
- mds_idle_clear_cpu_buffers();
-
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
@@ -80,7 +78,7 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
static inline void __mwaitx(unsigned long eax, unsigned long ebx,
unsigned long ecx)
{
- /* No MDS buffer clear as this is AMD/HYGON only */
+ /* No need for TSA buffer clearing on AMD */
/* "mwaitx %eax, %ebx, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xfb;"
@@ -89,7 +87,7 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
- mds_idle_clear_cpu_buffers();
+
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
@@ -107,6 +105,11 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
*/
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
+ if (need_resched())
+ return;
+
+ x86_idle_clear_cpu_buffers();
+
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
mb();
@@ -115,9 +118,13 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
- __mwait(eax, ecx);
+ if (need_resched())
+ goto out;
+
+ __mwait(eax, ecx);
}
+
+out:
current_clr_polling();
}
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 5c5f1e56c404..6f3d145670a9 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -59,6 +59,8 @@ int __register_nmi_handler(unsigned int, struct nmiaction *);
void unregister_nmi_handler(unsigned int, const char *);
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler);
+
void stop_nmi(void);
void restart_nmi(void);
void local_touch_nmi(void);
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index daf58a96e0a7..818a5913f219 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -119,9 +119,8 @@
.endm
/*
- * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
- * to the retpoline thunk with a CS prefix when the register requires
- * a RAX prefix byte to encode. Also see apply_retpolines().
+ * Emits a conditional CS prefix that is compatible with
+ * -mindirect-branch-cs-prefix.
*/
.macro __CS_PREFIX reg:req
.irp rs,r8,r9,r10,r11,r12,r13,r14,r15
@@ -203,27 +202,33 @@
.endm
/*
- * Macro to execute VERW instruction that mitigate transient data sampling
- * attacks such as MDS. On affected systems a microcode update overloaded VERW
- * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF.
- *
+ * Macro to execute VERW insns that mitigate transient data sampling
+ * attacks such as MDS or TSA. On affected systems a microcode update
+ * overloaded VERW insns to also clear the CPU buffers. VERW clobbers
+ * CFLAGS.ZF.
* Note: Only the memory operand variant of VERW clears the CPU buffers.
*/
-.macro CLEAR_CPU_BUFFERS
- ALTERNATIVE "jmp .Lskip_verw_\@", "", X86_FEATURE_CLEAR_CPU_BUF
+.macro __CLEAR_CPU_BUFFERS feature
+ ALTERNATIVE "jmp .Lskip_verw_\@", "", \feature
#ifdef CONFIG_X86_64
- verw mds_verw_sel(%rip)
+ verw x86_verw_sel(%rip)
#else
/*
* In 32bit mode, the memory operand must be a %cs reference. The data
* segments may not be usable (vm86 mode), and the stack segment may not
* be flat (ESPFIX32).
*/
- verw %cs:mds_verw_sel
+ verw %cs:x86_verw_sel
#endif
.Lskip_verw_\@:
.endm
+#define CLEAR_CPU_BUFFERS \
+ __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF
+
+#define VM_CLEAR_CPU_BUFFERS \
+ __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM
+
#ifdef CONFIG_X86_64
.macro CLEAR_BRANCH_HISTORY
ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
@@ -245,8 +250,12 @@
_ASM_PTR " 999b\n\t" \
".popsection\n\t"
+#define ITS_THUNK_SIZE 64
+
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
+typedef u8 its_thunk_t[ITS_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
+extern its_thunk_t __x86_indirect_its_thunk_array[];
#ifdef CONFIG_RETHUNK
extern void __x86_return_thunk(void);
@@ -254,6 +263,12 @@ extern void __x86_return_thunk(void);
static inline void __x86_return_thunk(void) {}
#endif
+#ifdef CONFIG_MITIGATION_ITS
+extern void its_return_thunk(void);
+#else
+static inline void its_return_thunk(void) {}
+#endif
+
extern void retbleed_return_thunk(void);
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
@@ -281,19 +296,22 @@ extern void (*x86_return_thunk)(void);
#ifdef CONFIG_X86_64
/*
+ * Emits a conditional CS prefix that is compatible with
+ * -mindirect-branch-cs-prefix.
+ */
+#define __CS_PREFIX(reg) \
+ ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\rs," reg "\n" \
+ ".byte 0x2e\n" \
+ ".endif\n" \
+ ".endr\n"
+
+/*
* Inline asm uses the %V modifier which is only in newer GCC
* which is ensured when CONFIG_RETPOLINE is defined.
*/
-# define CALL_NOSPEC \
- ALTERNATIVE_2( \
- ANNOTATE_RETPOLINE_SAFE \
- "call *%[thunk_target]\n", \
- "call __x86_indirect_thunk_%V[thunk_target]\n", \
- X86_FEATURE_RETPOLINE, \
- "lfence;\n" \
- ANNOTATE_RETPOLINE_SAFE \
- "call *%[thunk_target]\n", \
- X86_FEATURE_RETPOLINE_LFENCE)
+#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \
+ "call __x86_indirect_thunk_%V[thunk_target]\n"
# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
@@ -376,6 +394,8 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
extern u64 x86_pred_cmd;
+DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user);
+
static inline void indirect_branch_prediction_barrier(void)
{
alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB);
@@ -415,24 +435,24 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
-extern u16 mds_verw_sel;
+extern u16 x86_verw_sel;
#include <asm/segment.h>
/**
- * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+ * x86_clear_cpu_buffers - Buffer clearing support for different x86 CPU vulns
*
* This uses the otherwise unused and obsolete VERW instruction in
* combination with microcode which triggers a CPU buffer flush when the
* instruction is executed.
*/
-static __always_inline void mds_clear_cpu_buffers(void)
+static __always_inline void x86_clear_cpu_buffers(void)
{
static const u16 ds = __KERNEL_DS;
@@ -449,14 +469,15 @@ static __always_inline void mds_clear_cpu_buffers(void)
}
/**
- * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * x86_idle_clear_cpu_buffers - Buffer clearing support in idle for the MDS
+ * and TSA vulnerabilities.
*
* Clear CPU buffers if the corresponding static key is enabled
*/
-static inline void mds_idle_clear_cpu_buffers(void)
+static __always_inline void x86_idle_clear_cpu_buffers(void)
{
- if (static_branch_likely(&mds_idle_clear))
- mds_clear_cpu_buffers();
+ if (static_branch_likely(&cpu_buf_idle_clear))
+ x86_clear_cpu_buffers();
}
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 4d810b9478a4..fa3576ef19da 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -456,6 +456,7 @@ struct pebs_xmm {
*/
#define IBS_OP_CUR_CNT (0xFFF80ULL<<32)
#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32)
+#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52)
#define IBS_OP_CNT_CTL (1ULL<<19)
#define IBS_OP_VAL (1ULL<<18)
#define IBS_OP_ENABLE (1ULL<<17)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6c7f7c526450..96d95d1ed15d 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -40,6 +40,9 @@ static inline bool pgtable_l5_enabled(void)
#define pgtable_l5_enabled() 0
#endif /* CONFIG_X86_5LEVEL */
+#define ARCH_PAGE_TABLE_SYNC_MASK \
+ (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED)
+
extern unsigned int pgdir_shift;
extern unsigned int ptrs_per_p4d;
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 2551baec927d..d9a38d379d18 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,8 +25,9 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
-typedef void crash_vmclear_fn(void);
-extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+typedef void (cpu_emergency_virt_cb)(void);
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_disable_virtualization(void);
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 724ce44809ed..1b683bf71d14 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -70,16 +70,6 @@ static inline void __cpu_emergency_vmxoff(void)
cpu_vmxoff();
}
-/** Disable VMX if it is supported and enabled on the current CPU
- */
-static inline void cpu_emergency_vmxoff(void)
-{
- if (cpu_has_vmx())
- __cpu_emergency_vmxoff();
-}
-
-
-
/*
* SVM functions:
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9b4eddd5833a..a54c4d3633e9 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -94,12 +94,13 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func);
#ifdef MODULE
#define __ADDRESSABLE_xen_hypercall
#else
-#define __ADDRESSABLE_xen_hypercall __ADDRESSABLE_ASM_STR(__SCK__xen_hypercall)
+#define __ADDRESSABLE_xen_hypercall \
+ __stringify(.global STATIC_CALL_KEY(xen_hypercall);)
#endif
#define __HYPERCALL \
__ADDRESSABLE_xen_hypercall \
- "call __SCT__xen_hypercall"
+ __stringify(call STATIC_CALL_TRAMP(xen_hypercall))
#define __HYPERCALL_ENTRY(x) "a" (x)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 69f85e274611..3bc6be6404d2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -18,6 +18,8 @@
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
+#include <linux/moduleloader.h>
+#include <linux/cleanup.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@@ -30,6 +32,8 @@
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
+#include <asm/set_memory.h>
int __read_mostly alternatives_patched;
@@ -396,6 +400,212 @@ static int emit_indirect(int op, int reg, u8 *bytes)
return i;
}
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
+static void *its_page;
+static unsigned int its_offset;
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
+{
+ u8 *bytes = thunk;
+ int i = 0;
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+ bytes[i++] = 0xff;
+ bytes[i++] = 0xe0 + reg; /* jmp *reg */
+ bytes[i++] = 0xcc;
+
+ return thunk;
+}
+
+void its_init_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ mutex_lock(&text_mutex);
+ its_mod = mod;
+ its_page = NULL;
+}
+
+void its_fini_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ WARN_ON_ONCE(its_mod != mod);
+
+ its_mod = NULL;
+ its_page = NULL;
+ mutex_unlock(&text_mutex);
+
+ for (int i = 0; i < mod->its_num_pages; i++) {
+ void *page = mod->its_page_array[i];
+ set_memory_ro((unsigned long)page, 1);
+ set_memory_x((unsigned long)page, 1);
+ }
+}
+
+void its_free_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ for (int i = 0; i < mod->its_num_pages; i++) {
+ void *page = mod->its_page_array[i];
+ module_memfree(page);
+ }
+ kfree(mod->its_page_array);
+}
+
+DEFINE_FREE(its_execmem, void *, if (_T) module_memfree(_T));
+
+static void *its_alloc(void)
+{
+ void *page __free(its_execmem) = module_alloc(PAGE_SIZE);
+
+ if (!page)
+ return NULL;
+
+ if (its_mod) {
+ void *tmp = krealloc(its_mod->its_page_array,
+ (its_mod->its_num_pages+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ its_mod->its_page_array = tmp;
+ its_mod->its_page_array[its_mod->its_num_pages++] = page;
+ }
+
+ return no_free_ptr(page);
+}
+
+static void *its_allocate_thunk(int reg)
+{
+ int size = 3 + (reg / 8);
+ void *thunk;
+
+ if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+ its_page = its_alloc();
+ if (!its_page) {
+ pr_err("ITS page allocation failed\n");
+ return NULL;
+ }
+ memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+ its_offset = 32;
+ }
+
+ /*
+ * If the indirect branch instruction will be in the lower half
+ * of a cacheline, then update the offset to reach the upper half.
+ */
+ if ((its_offset + size - 1) % 64 < 32)
+ its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+ thunk = its_page + its_offset;
+ its_offset += size;
+
+ set_memory_rw((unsigned long)its_page, 1);
+ thunk = its_init_thunk(thunk, reg);
+ set_memory_ro((unsigned long)its_page, 1);
+ set_memory_x((unsigned long)its_page, 1);
+
+ return thunk;
+}
+#else /* CONFIG_MODULES */
+
+static void *its_allocate_thunk(int reg)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+ void *call_dest, void *jmp_dest)
+{
+ u8 op = insn->opcode.bytes[0];
+ int i = 0;
+
+ /*
+ * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
+ * tail-calls. Deal with them.
+ */
+ if (is_jcc32(insn)) {
+ bytes[i++] = op;
+ op = insn->opcode.bytes[1];
+ goto clang_jcc;
+ }
+
+ if (insn->length == 6)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ __text_gen_insn(bytes+i, op, addr+i,
+ call_dest,
+ CALL_INSN_SIZE);
+ i += CALL_INSN_SIZE;
+ break;
+
+ case JMP32_INSN_OPCODE:
+clang_jcc:
+ __text_gen_insn(bytes+i, op, addr+i,
+ jmp_dest,
+ JMP32_INSN_SIZE);
+ i += JMP32_INSN_SIZE;
+ break;
+
+ default:
+ WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
+ return -1;
+ }
+
+ WARN_ON_ONCE(i != insn->length);
+
+ return i;
+}
+
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+ u8 *tmp = its_allocate_thunk(reg);
+
+ if (tmp)
+ thunk = tmp;
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return false;
+
+ /* Indirect branch opcode is 2 or 3 bytes depending on reg */
+ addr += 1 + reg / 8;
+
+ /* Lower-half of the cacheline? */
+ return !(addr & 0x20);
+}
+
+u8 *its_static_thunk(int reg)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+ return thunk;
+}
+
+#endif
+
/*
* Rewrite the compiler generated retpoline thunk calls.
*
@@ -466,6 +676,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
bytes[i++] = 0xe8; /* LFENCE */
}
+#ifdef CONFIG_MITIGATION_ITS
+ /*
+ * Check if the address of last byte of emitted-indirect is in
+ * lower-half of the cacheline. Such branches need ITS mitigation.
+ */
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+ return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
ret = emit_indirect(op, reg, bytes + i);
if (ret < 0)
return ret;
@@ -537,6 +756,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
#ifdef CONFIG_RETHUNK
+bool cpu_wants_rethunk(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
+
+bool cpu_wants_rethunk_at(void *addr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ return false;
+ if (x86_return_thunk != its_return_thunk)
+ return true;
+
+ return !((unsigned long)addr & 0x20);
+}
+
/*
* Rewrite the compiler generated return thunk tail-calls.
*
@@ -552,13 +786,12 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
{
int i = 0;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
- if (x86_return_thunk == __x86_return_thunk)
- return -1;
-
+ /* Patch the custom return thunks... */
+ if (cpu_wants_rethunk_at(addr)) {
i = JMP32_INSN_SIZE;
__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
} else {
+ /* ... or patch them out if not needed. */
bytes[i++] = RET_INSN_OPCODE;
}
@@ -914,6 +1147,8 @@ static noinline void __init int3_selftest(void)
void __init alternative_instructions(void)
{
+ u64 ibt;
+
int3_selftest();
/*
@@ -951,6 +1186,9 @@ void __init alternative_instructions(void)
*/
paravirt_set_cap();
+ /* Keep CET-IBT disabled until caller/callee are patched */
+ ibt = ibt_save();
+
/*
* First patch paravirt functions, such that we overwrite the indirect
* call with the direct call.
@@ -972,6 +1210,8 @@ void __init alternative_instructions(void)
apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+ ibt_restore(ibt);
+
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9ac93b4ba67b..2d71c329b347 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -553,6 +553,64 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c)
#endif
}
+static bool amd_check_tsa_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ union zen_patch_rev p;
+ u32 min_rev = 0;
+
+ p.ext_fam = c->x86 - 0xf;
+ p.model = c->x86_model;
+ p.ext_model = c->x86_model >> 4;
+ p.stepping = c->x86_stepping;
+ /* reserved bits are expected to be 0 in test below */
+ p.__reserved = 0;
+
+ if (c->x86 == 0x19) {
+ switch (p.ucode_rev >> 8) {
+ case 0xa0011: min_rev = 0x0a0011d7; break;
+ case 0xa0012: min_rev = 0x0a00123b; break;
+ case 0xa0082: min_rev = 0x0a00820d; break;
+ case 0xa1011: min_rev = 0x0a10114c; break;
+ case 0xa1012: min_rev = 0x0a10124c; break;
+ case 0xa1081: min_rev = 0x0a108109; break;
+ case 0xa2010: min_rev = 0x0a20102e; break;
+ case 0xa2012: min_rev = 0x0a201211; break;
+ case 0xa4041: min_rev = 0x0a404108; break;
+ case 0xa5000: min_rev = 0x0a500012; break;
+ case 0xa6012: min_rev = 0x0a60120a; break;
+ case 0xa7041: min_rev = 0x0a704108; break;
+ case 0xa7052: min_rev = 0x0a705208; break;
+ case 0xa7080: min_rev = 0x0a708008; break;
+ case 0xa70c0: min_rev = 0x0a70c008; break;
+ case 0xaa002: min_rev = 0x0aa00216; break;
+ default:
+ pr_debug("%s: ucode_rev: 0x%x, current revision: 0x%x\n",
+ __func__, p.ucode_rev, c->microcode);
+ return false;
+ }
+ }
+
+ if (!min_rev)
+ return false;
+
+ return c->microcode >= min_rev;
+}
+
+static void tsa_init(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (c->x86 == 0x19) {
+ if (amd_check_tsa_microcode())
+ setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO);
+ setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO);
+ }
+}
+
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -620,6 +678,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
resctrl_cpu_detect(c);
+
+ tsa_init(c);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -663,6 +723,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
if (!(msr & MSR_K7_HWCR_SMMLOCK))
goto clear_sev;
+
return;
clear_all:
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7d73b5311551..ff8965bce6c9 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -48,6 +48,9 @@ static void __init srbds_select_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
static void __init gds_select_mitigation(void);
static void __init srso_select_mitigation(void);
+static void __init its_select_mitigation(void);
+static void __init tsa_select_mitigation(void);
+static void __init vmscape_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -57,6 +60,14 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+/*
+ * Set when the CPU has run a potentially malicious guest. An IBPB will
+ * be needed to before running userspace. That IBPB will flush the branch
+ * predictor content.
+ */
+DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user);
+
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
EXPORT_SYMBOL_GPL(x86_pred_cmd);
@@ -66,6 +77,13 @@ static DEFINE_MUTEX(spec_ctrl_mutex);
void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+static void __init set_return_thunk(void *thunk)
+{
+ x86_return_thunk = thunk;
+
+ pr_info("active return thunk: %ps\n", thunk);
+}
+
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
static void update_spec_ctrl(u64 val)
{
@@ -112,9 +130,9 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-/* Control MDS CPU buffer clear before idling (halt, mwait) */
-DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
-EXPORT_SYMBOL_GPL(mds_idle_clear);
+/* Control CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_idle_clear);
/*
* Controls whether l1d flush based mitigations are enabled,
@@ -174,6 +192,9 @@ void __init cpu_select_mitigations(void)
*/
srso_select_mitigation();
gds_select_mitigation();
+ its_select_mitigation();
+ tsa_select_mitigation();
+ vmscape_select_mitigation();
}
/*
@@ -434,7 +455,7 @@ static void __init mmio_select_mitigation(void)
* is required irrespective of SMT state.
*/
if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
- static_branch_enable(&mds_idle_clear);
+ static_branch_enable(&cpu_buf_idle_clear);
/*
* Check if the system has the right microcode.
@@ -1081,7 +1102,7 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_UNRET);
if (IS_ENABLED(CONFIG_RETHUNK))
- x86_return_thunk = retbleed_return_thunk;
+ set_return_thunk(retbleed_return_thunk);
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -1143,6 +1164,116 @@ do_cmd_auto:
}
#undef pr_fmt
+#define pr_fmt(fmt) "ITS: " fmt
+
+enum its_mitigation_cmd {
+ ITS_CMD_OFF,
+ ITS_CMD_ON,
+ ITS_CMD_VMEXIT,
+};
+
+enum its_mitigation {
+ ITS_MITIGATION_OFF,
+ ITS_MITIGATION_VMEXIT_ONLY,
+ ITS_MITIGATION_ALIGNED_THUNKS,
+};
+
+static const char * const its_strings[] = {
+ [ITS_MITIGATION_OFF] = "Vulnerable",
+ [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected",
+ [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks",
+};
+
+static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS;
+
+static enum its_mitigation_cmd its_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_CMD_ON : ITS_CMD_OFF;
+
+static int __init its_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+ pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+ return 0;
+ }
+
+ if (!strcmp(str, "off")) {
+ its_cmd = ITS_CMD_OFF;
+ } else if (!strcmp(str, "on")) {
+ its_cmd = ITS_CMD_ON;
+ } else if (!strcmp(str, "force")) {
+ its_cmd = ITS_CMD_ON;
+ setup_force_cpu_bug(X86_BUG_ITS);
+ } else if (!strcmp(str, "vmexit")) {
+ its_cmd = ITS_CMD_VMEXIT;
+ } else {
+ pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+ enum its_mitigation_cmd cmd = its_cmd;
+
+ if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ /* Exit early to avoid irrelevant warnings */
+ if (cmd == ITS_CMD_OFF) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (spectre_v2_enabled == SPECTRE_V2_NONE) {
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (!IS_ENABLED(CONFIG_RETPOLINE) || !IS_ENABLED(CONFIG_RETHUNK)) {
+ pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+ pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ goto out;
+ }
+
+ switch (cmd) {
+ case ITS_CMD_OFF:
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ case ITS_CMD_VMEXIT:
+ if (boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) {
+ its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+ goto out;
+ }
+ fallthrough;
+ case ITS_CMD_ON:
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ }
+out:
+ pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V2 : " fmt
static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
@@ -1553,7 +1684,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
rrsba_disabled = true;
}
-static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
{
/*
* Similar to context switches, there are two types of RSB attacks
@@ -1577,27 +1708,30 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
*/
switch (mode) {
case SPECTRE_V2_NONE:
- return;
+ break;
- case SPECTRE_V2_EIBRS_LFENCE:
case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
}
- return;
+ break;
- case SPECTRE_V2_EIBRS_RETPOLINE:
case SPECTRE_V2_RETPOLINE:
case SPECTRE_V2_LFENCE:
case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
- return;
- }
+ break;
- pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
- dump_stack();
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
}
/*
@@ -1653,10 +1787,11 @@ static void __init bhi_select_mitigation(void)
return;
}
- if (spec_ctrl_bhi_dis())
+ if (!IS_ENABLED(CONFIG_X86_64))
return;
- if (!IS_ENABLED(CONFIG_X86_64))
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
return;
/* Mitigate KVM by default */
@@ -1822,10 +1957,7 @@ static void __init spectre_v2_select_mitigation(void)
*
* FIXME: Is this pointless for retbleed-affected AMD?
*/
- setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
-
- spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+ spectre_v2_select_rsb_mitigation(mode);
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
@@ -1907,10 +2039,10 @@ static void update_mds_branch_idle(void)
return;
if (sched_smt_active()) {
- static_branch_enable(&mds_idle_clear);
+ static_branch_enable(&cpu_buf_idle_clear);
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
- static_branch_disable(&mds_idle_clear);
+ static_branch_disable(&cpu_buf_idle_clear);
}
}
@@ -1918,60 +2050,92 @@ static void update_mds_branch_idle(void)
#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
-void cpu_bugs_smt_update(void)
+#undef pr_fmt
+#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt
+
+enum tsa_mitigations {
+ TSA_MITIGATION_NONE,
+ TSA_MITIGATION_UCODE_NEEDED,
+ TSA_MITIGATION_USER_KERNEL,
+ TSA_MITIGATION_VM,
+ TSA_MITIGATION_FULL,
+};
+
+static const char * const tsa_strings[] = {
+ [TSA_MITIGATION_NONE] = "Vulnerable",
+ [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary",
+ [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM",
+ [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+};
+
+static enum tsa_mitigations tsa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_FULL : TSA_MITIGATION_NONE;
+
+static int __init tsa_parse_cmdline(char *str)
{
- mutex_lock(&spec_ctrl_mutex);
+ if (!str)
+ return -EINVAL;
- if (sched_smt_active() && unprivileged_ebpf_enabled() &&
- spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
- pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ if (!strcmp(str, "off"))
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ else if (!strcmp(str, "on"))
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ else if (!strcmp(str, "user"))
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ else if (!strcmp(str, "vm"))
+ tsa_mitigation = TSA_MITIGATION_VM;
+ else
+ pr_err("Ignoring unknown tsa=%s option.\n", str);
- switch (spectre_v2_user_stibp) {
- case SPECTRE_V2_USER_NONE:
- break;
- case SPECTRE_V2_USER_STRICT:
- case SPECTRE_V2_USER_STRICT_PREFERRED:
- update_stibp_strict();
- break;
- case SPECTRE_V2_USER_PRCTL:
- case SPECTRE_V2_USER_SECCOMP:
- update_indir_branch_cond();
- break;
- }
+ return 0;
+}
+early_param("tsa", tsa_parse_cmdline);
- switch (mds_mitigation) {
- case MDS_MITIGATION_FULL:
- case MDS_MITIGATION_VMWERV:
- if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
- pr_warn_once(MDS_MSG_SMT);
- update_mds_branch_idle();
- break;
- case MDS_MITIGATION_OFF:
- break;
+static void __init tsa_select_mitigation(void)
+{
+ if (tsa_mitigation == TSA_MITIGATION_NONE)
+ return;
+
+ if (cpu_mitigations_off() || !boot_cpu_has_bug(X86_BUG_TSA)) {
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ return;
}
- switch (taa_mitigation) {
- case TAA_MITIGATION_VERW:
- case TAA_MITIGATION_UCODE_NEEDED:
- if (sched_smt_active())
- pr_warn_once(TAA_MSG_SMT);
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
+ tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
+
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
break;
- case TAA_MITIGATION_TSX_DISABLED:
- case TAA_MITIGATION_OFF:
+
+ case TSA_MITIGATION_VM:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
break;
- }
- switch (mmio_mitigation) {
- case MMIO_MITIGATION_VERW:
- case MMIO_MITIGATION_UCODE_NEEDED:
- if (sched_smt_active())
- pr_warn_once(MMIO_MSG_SMT);
+ case TSA_MITIGATION_UCODE_NEEDED:
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ goto out;
+
+ pr_notice("Forcing mitigation on in a VM\n");
+
+ /*
+ * On the off-chance that microcode has been updated
+ * on the host, enable the mitigation in the guest just
+ * in case.
+ */
+ fallthrough;
+ case TSA_MITIGATION_FULL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
break;
- case MMIO_MITIGATION_OFF:
+ default:
break;
}
- mutex_unlock(&spec_ctrl_mutex);
+out:
+ pr_info("%s\n", tsa_strings[tsa_mitigation]);
}
#undef pr_fmt
@@ -2591,10 +2755,10 @@ static void __init srso_select_mitigation(void)
if (boot_cpu_data.x86 == 0x19) {
setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
- x86_return_thunk = srso_alias_return_thunk;
+ set_return_thunk(srso_alias_return_thunk);
} else {
setup_force_cpu_cap(X86_FEATURE_SRSO);
- x86_return_thunk = srso_return_thunk;
+ set_return_thunk(srso_return_thunk);
}
srso_mitigation = SRSO_MITIGATION_SAFE_RET;
} else {
@@ -2663,8 +2827,168 @@ pred_cmd:
}
#undef pr_fmt
+#define pr_fmt(fmt) "VMSCAPE: " fmt
+
+enum vmscape_mitigations {
+ VMSCAPE_MITIGATION_NONE,
+ VMSCAPE_MITIGATION_AUTO,
+ VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER,
+ VMSCAPE_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+static const char * const vmscape_strings[] = {
+ [VMSCAPE_MITIGATION_NONE] = "Vulnerable",
+ /* [VMSCAPE_MITIGATION_AUTO] */
+ [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace",
+ [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT",
+};
+
+static enum vmscape_mitigations vmscape_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE;
+
+static int __init vmscape_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ } else if (!strcmp(str, "ibpb")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+ vmscape_mitigation = VMSCAPE_MITIGATION_AUTO;
+ } else {
+ pr_err("Ignoring unknown vmscape=%s option.\n", str);
+ }
+
+ return 0;
+}
+early_param("vmscape", vmscape_parse_cmdline);
+
+static void __init vmscape_select_mitigation(void)
+{
+ if (cpu_mitigations_off() ||
+ !boot_cpu_has_bug(X86_BUG_VMSCAPE) ||
+ !boot_cpu_has(X86_FEATURE_IBPB)) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ return;
+ }
+
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO)
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB ||
+ srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT)
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT;
+
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER)
+ setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER);
+
+ pr_info("%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) fmt
+#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n"
+
+void cpu_bugs_smt_update(void)
+{
+ mutex_lock(&spec_ctrl_mutex);
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ switch (mds_mitigation) {
+ case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_VMWERV:
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+ pr_warn_once(MDS_MSG_SMT);
+ update_mds_branch_idle();
+ break;
+ case MDS_MITIGATION_OFF:
+ break;
+ }
+
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ case TSA_MITIGATION_VM:
+ case TSA_MITIGATION_FULL:
+ case TSA_MITIGATION_UCODE_NEEDED:
+ /*
+ * TSA-SQ can potentially lead to info leakage between
+ * SMT threads.
+ */
+ if (sched_smt_active())
+ static_branch_enable(&cpu_buf_idle_clear);
+ else
+ static_branch_disable(&cpu_buf_idle_clear);
+ break;
+ case TSA_MITIGATION_NONE:
+ break;
+ }
+
+ switch (vmscape_mitigation) {
+ case VMSCAPE_MITIGATION_NONE:
+ case VMSCAPE_MITIGATION_AUTO:
+ break;
+ case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT:
+ case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER:
+ /*
+ * Hypervisors can be attacked across-threads, warn for SMT when
+ * STIBP is not already enabled system-wide.
+ *
+ * Intel eIBRS (!AUTOIBRS) implies STIBP on.
+ */
+ if (!sched_smt_active() ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
+ break;
+ pr_warn_once(VMSCAPE_MSG_SMT);
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
#ifdef CONFIG_SYSFS
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
@@ -2774,6 +3098,11 @@ static ssize_t rfds_show_state(char *buf)
return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
}
+static ssize_t its_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2900,6 +3229,16 @@ static ssize_t srso_show_state(char *buf)
boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode");
}
+static ssize_t tsa_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static ssize_t vmscape_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -2958,6 +3297,15 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_RFDS:
return rfds_show_state(buf);
+ case X86_BUG_ITS:
+ return its_show_state(buf);
+
+ case X86_BUG_TSA:
+ return tsa_show_state(buf);
+
+ case X86_BUG_VMSCAPE:
+ return vmscape_show_state(buf);
+
default:
break;
}
@@ -3037,4 +3385,19 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib
{
return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
}
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
+
+ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
+}
+
+ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 62aaabea7e33..19c9087e2b84 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1045,17 +1045,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_D_1_EAX] = eax;
}
- /* AMD-defined flags: level 0x80000001 */
+ /*
+ * Check if extended CPUID leaves are implemented: Max extended
+ * CPUID leaf must be in the 0x80000001-0x8000ffff range.
+ */
eax = cpuid_eax(0x80000000);
- c->extended_cpuid_level = eax;
+ c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0;
- if ((eax & 0xffff0000) == 0x80000000) {
- if (eax >= 0x80000001) {
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (c->extended_cpuid_level >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- c->x86_capability[CPUID_8000_0001_ECX] = ecx;
- c->x86_capability[CPUID_8000_0001_EDX] = edx;
- }
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
if (c->extended_cpuid_level >= 0x80000007) {
@@ -1251,39 +1252,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define GDS BIT(6)
/* CPU is affected by Register File Data Sampling */
#define RFDS BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY BIT(9)
+/* CPU is affected by Transient Scheduler Attacks */
+#define TSA BIT(10)
+/* CPU is affected by VMSCAPE */
+#define VMSCAPE BIT(11)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(SANDYBRIDGE_X, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(SANDYBRIDGE, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(IVYBRIDGE_X, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x0, 0x5), MMIO | RETBLEED | GDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xb), MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xc), MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE_N, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(METEORLAKE_L, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(ARROWLAKE_H, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(ARROWLAKE, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(LUNARLAKE_M, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(SAPPHIRERAPIDS_X,X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPING_ANY, VMSCAPE),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE_N, X86_STEPPING_ANY, RFDS | VMSCAPE),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
@@ -1293,9 +1315,9 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
- VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
- VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
- VULNBL_AMD(0x19, SRSO),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE),
{}
};
@@ -1341,6 +1363,32 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
return cpu_matches(cpu_vuln_blacklist, RFDS);
}
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+ return false;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ /* None of the affected CPUs have BHI_CTRL */
+ if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ /*
+ * If a VMM did not expose ITS_NO, assume that a guest could
+ * be running on a vulnerable hardware or may migrate to such
+ * hardware.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ if (cpu_matches(cpu_vuln_blacklist, ITS))
+ return true;
+
+ return false;
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
@@ -1455,9 +1503,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (vulnerable_to_rfds(x86_arch_cap_msr))
setup_force_cpu_bug(X86_BUG_RFDS);
- /* When virtualized, eIBRS could be hidden, assume vulnerable */
- if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) &&
- !cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ /*
+ * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+ * BHI_NO still need to use the BHI mitigation to prevent Intra-mode
+ * attacks. When virtualized, eIBRS could be hidden, assume vulnerable.
+ */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
(boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
boot_cpu_has(X86_FEATURE_HYPERVISOR)))
setup_force_cpu_bug(X86_BUG_BHI);
@@ -1465,6 +1516,30 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+ if (vulnerable_to_its(x86_arch_cap_msr)) {
+ setup_force_cpu_bug(X86_BUG_ITS);
+ if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+ setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+ }
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) ||
+ !cpu_has(c, X86_FEATURE_TSA_L1_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, TSA) ||
+ /* Enable bug on Zen guests to allow for live migration. */
+ (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN)))
+ setup_force_cpu_bug(X86_BUG_TSA);
+ }
+ }
+
+ /*
+ * Set the bug only on bare-metal. A nested hypervisor should already be
+ * deploying IBPB to isolate itself from nested guests.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) &&
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 8a80d5343f3a..4f69e85bc255 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -14,6 +14,7 @@
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
+#include <asm/resctrl.h>
#include "cpu.h"
@@ -239,6 +240,8 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
}
}
+
+ resctrl_cpu_detect(c);
}
static void early_init_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 991f38f57caf..9939b984bceb 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -333,7 +333,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
struct thresh_restart {
struct threshold_block *b;
- int reset;
int set_lvt_off;
int lvt_off;
u16 old_limit;
@@ -428,13 +427,13 @@ static void threshold_restart_bank(void *_tr)
rdmsr(tr->b->address, lo, hi);
- if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
- tr->reset = 1; /* limit cannot be lower than err count */
-
- if (tr->reset) { /* reset err count and overflow bit */
- hi =
- (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - tr->b->threshold_limit);
+ /*
+ * Reset error count and overflow bit.
+ * This is done during init or after handling an interrupt.
+ */
+ if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) {
+ hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
+ hi |= THRESHOLD_MAX - tr->b->threshold_limit;
} else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
@@ -1053,13 +1052,20 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol
}
bank_type = smca_get_bank_type(cpu, bank);
- if (bank_type >= N_SMCA_BANK_TYPES)
- return NULL;
if (b && bank_type == SMCA_UMC) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
return smca_umc_block_names[b->block];
- return NULL;
+ }
+
+ if (b && b->block) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block);
+ return buf_mcatype;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank);
+ return buf_mcatype;
}
if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 359218bc1b34..e65dfc88b925 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2707,15 +2707,9 @@ static int mce_cpu_dead(unsigned int cpu)
static int mce_cpu_online(unsigned int cpu)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- int ret;
mce_device_create(cpu);
-
- ret = mce_threshold_create_device(cpu);
- if (ret) {
- mce_device_remove(cpu);
- return ret;
- }
+ mce_threshold_create_device(cpu);
mce_reenable_cpu();
mce_start_timer(t);
return 0;
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 95275a5e57e0..7d0fb5e0b32f 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -500,6 +500,7 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c)
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
{
intel_clear_lmce();
+ cmci_clear();
}
bool intel_filter_mce(struct mce *m)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 558108296f3c..31549e7f6b7c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -349,7 +349,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (boot_cpu_has(X86_FEATURE_MTRR))
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 28c357cf7c75..0d019e6972be 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -45,6 +45,9 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
+ { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
+ { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index c4960b8e5195..b86eb601827b 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -718,6 +718,8 @@ int arch_memory_failure(unsigned long pfn, int flags)
goto out;
}
+ sgx_unmark_page_reclaimable(page);
+
/*
* TBD: Add additional plumbing to enable pre-emptive
* action for asynchronous poison notification. Until
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ec51ce713dea..48cf91625a16 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -360,7 +360,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
goto fail;
ip = trampoline + size;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk_at(ip))
__text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
memcpy(ip, retq, sizeof(retq));
@@ -415,8 +415,6 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
- set_vm_flush_reset_perms(trampoline);
-
if (likely(system_state != SYSTEM_BOOTING))
set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 80e262bb627f..cb9852ad6098 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -46,7 +46,8 @@ bool __init pit_timer_init(void)
* VMMs otherwise steal CPU time just to pointlessly waggle
* the (masked) IRQ.
*/
- clockevent_i8253_disable();
+ scoped_guard(irq)
+ clockevent_i8253_disable();
return false;
}
clockevent_i8253_init(true);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index e2fab3ceb09f..9a101150376d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct *tsk)
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
-static void task_update_io_bitmap(struct task_struct *tsk)
+static void task_update_io_bitmap(void)
{
+ struct task_struct *tsk = current;
struct thread_struct *t = &tsk->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
@@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *tsk)
struct io_bitmap *iobm = tsk->thread.io_bitmap;
tsk->thread.io_bitmap = NULL;
- task_update_io_bitmap(tsk);
+ /*
+ * Don't touch the TSS when invoked on a failed fork(). TSS
+ * reflects the state of @current and not the state of @tsk.
+ */
+ if (tsk == current)
+ task_update_io_bitmap();
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
@@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
}
t->iopl_emul = level;
- task_update_io_bitmap(current);
-
+ task_update_io_bitmap();
return 0;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 991f00c817e6..180c708879d2 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -427,7 +427,6 @@ void *alloc_insn_page(void)
if (!page)
return NULL;
- set_vm_flush_reset_perms(page);
/*
* First make the page read-only, and only then make it executable to
* prevent it from being W+X in between.
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 5d61a342871b..24b6eaacc81e 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -149,8 +149,7 @@ static void free_transition_pgtable(struct kimage *image)
image->arch.pte = NULL;
}
-static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
- unsigned long control_page)
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
@@ -161,7 +160,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
pte_t *pte;
vaddr = (unsigned long)relocate_kernel;
- paddr = control_page;
+ paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@@ -220,7 +219,7 @@ static void *alloc_pgt_page(void *data)
return p;
}
-static int init_pgtable(struct kimage *image, unsigned long control_page)
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
@@ -229,12 +228,12 @@ static int init_pgtable(struct kimage *image, unsigned long control_page)
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
unsigned long mstart, mend;
+ pgd_t *level4p;
int result;
int i;
- image->arch.pgd = alloc_pgt_page(image);
- if (!image->arch.pgd)
- return -ENOMEM;
+ level4p = (pgd_t *)__va(start_pgtable);
+ clear_page(level4p);
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
@@ -248,8 +247,8 @@ static int init_pgtable(struct kimage *image, unsigned long control_page)
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
- result = kernel_ident_mapping_init(&info, image->arch.pgd,
- mstart, mend);
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
if (result)
return result;
}
@@ -264,8 +263,8 @@ static int init_pgtable(struct kimage *image, unsigned long control_page)
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
- result = kernel_ident_mapping_init(&info, image->arch.pgd,
- mstart, mend);
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
if (result)
return result;
@@ -275,19 +274,15 @@ static int init_pgtable(struct kimage *image, unsigned long control_page)
* Prepare EFI systab and ACPI tables for kexec kernel since they are
* not covered by pfn_mapped.
*/
- result = map_efi_systab(&info, image->arch.pgd);
+ result = map_efi_systab(&info, level4p);
if (result)
return result;
- result = map_acpi_tables(&info, image->arch.pgd);
+ result = map_acpi_tables(&info, level4p);
if (result)
return result;
- /*
- * This must be last because the intermediate page table pages it
- * allocates will not be control pages and may overlap the image.
- */
- return init_transition_pgtable(image, image->arch.pgd, control_page);
+ return init_transition_pgtable(image, level4p);
}
static void load_segments(void)
@@ -304,14 +299,14 @@ static void load_segments(void)
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long control_page;
+ unsigned long start_pgtable;
int result;
/* Calculate the offsets */
- control_page = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
/* Setup the identity mapped 64bit page table */
- result = init_pgtable(image, control_page);
+ result = init_pgtable(image, start_pgtable);
if (result)
return result;
@@ -358,12 +353,13 @@ void machine_kexec(struct kimage *image)
#endif
}
- control_page = page_address(image->control_code_page);
+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_TABLE_PAGE] = (unsigned long)__pa(image->arch.pgd);
+ page_list[PA_TABLE_PAGE] =
+ (unsigned long)__pa(page_address(image->control_code_page));
if (image->type == KEXEC_TYPE_DEFAULT)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
@@ -582,7 +578,8 @@ static void kexec_mark_crashkres(bool protect)
/* Don't touch the control code page used in crash_kexec().*/
control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
- kexec_mark_range(crashk_res.start, control - 1, protect);
+ /* Control code page is located in the 2nd page. */
+ kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
control += KEXEC_CONTROL_PAGE_SIZE;
kexec_mark_range(control, crashk_res.end, protect);
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index c032edcd3d95..c34ea5e028c4 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -74,10 +74,11 @@ void *module_alloc(unsigned long size)
return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN,
- MODULES_VADDR + get_module_load_offset(),
- MODULES_END, gfp_mask,
- PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
- __builtin_return_address(0));
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, gfp_mask, PAGE_KERNEL,
+ VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
+ NUMA_NO_NODE, __builtin_return_address(0));
+
if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
@@ -285,10 +286,16 @@ int module_finalize(const Elf_Ehdr *hdr,
void *pseg = (void *)para->sh_addr;
apply_paravirt(pseg, pseg + para->sh_size);
}
+
+ its_init_mod(me);
+
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
apply_retpolines(rseg, rseg + retpolines->sh_size);
}
+
+ its_fini_mod(me);
+
if (returns) {
void *rseg = (void *)returns->sh_addr;
apply_returns(rseg, rseg + returns->sh_size);
@@ -320,4 +327,5 @@ int module_finalize(const Elf_Ehdr *hdr,
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
+ its_free_mod(mod);
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ed6cce6c3950..b9a128546970 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -38,8 +38,12 @@
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
+/*
+ * An emergency handler can be set in any context including NMI
+ */
struct nmi_desc {
raw_spinlock_t lock;
+ nmi_handler_t emerg_handler;
struct list_head head;
};
@@ -121,9 +125,22 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration)
static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
struct nmi_desc *desc = nmi_to_desc(type);
+ nmi_handler_t ehandler;
struct nmiaction *a;
int handled=0;
+ /*
+ * Call the emergency handler, if set
+ *
+ * In the case of crash_nmi_callback() emergency handler, it will
+ * return in the case of the crashing CPU to enable it to complete
+ * other necessary crashing actions ASAP. Other handlers in the
+ * linked list won't need to be run.
+ */
+ ehandler = desc->emerg_handler;
+ if (ehandler)
+ return ehandler(type, regs);
+
rcu_read_lock();
/*
@@ -213,6 +230,31 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+/**
+ * set_emergency_nmi_handler - Set emergency handler
+ * @type: NMI type
+ * @handler: the emergency handler to be stored
+ *
+ * Set an emergency NMI handler which, if set, will preempt all the other
+ * handlers in the linked list. If a NULL handler is passed in, it will clear
+ * it. It is expected that concurrent calls to this function will not happen
+ * or the system is screwed beyond repair.
+ */
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+
+ if (WARN_ON_ONCE(desc->emerg_handler == handler))
+ return;
+ desc->emerg_handler = handler;
+
+ /*
+ * Ensure the emergency handler is visible to other CPUs before
+ * function return
+ */
+ smp_wmb();
+}
+
static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 24c237c57198..1138b999cb14 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -154,6 +154,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -428,6 +429,11 @@ void native_tss_update_io_bitmap(void)
} else {
struct io_bitmap *iobm = t->io_bitmap;
+ if (WARN_ON_ONCE(!iobm)) {
+ clear_thread_flag(TIF_IO_BITMAP);
+ native_tss_invalidate_io_bitmap();
+ }
+
/*
* Only copy bitmap data when the sequence number differs. The
* update time is accounted to the incoming task.
@@ -881,6 +887,11 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
*/
static __cpuidle void mwait_idle(void)
{
+ if (need_resched())
+ return;
+
+ x86_idle_clear_cpu_buffers();
+
if (!current_set_polling_and_test()) {
if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
mb(); /* quirk */
@@ -889,13 +900,17 @@ static __cpuidle void mwait_idle(void)
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
- __sti_mwait(0, 0);
- else
+ if (need_resched()) {
raw_local_irq_enable();
+ goto out;
+ }
+
+ __sti_mwait(0, 0);
} else {
raw_local_irq_enable();
}
+
+out:
__current_clr_polling();
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 299b970e5f82..79e1ac3d0625 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -787,26 +787,27 @@ void machine_crash_shutdown(struct pt_regs *regs)
}
#endif
-/*
- * This is used to VMCLEAR all VMCSs loaded on the
- * processor. And when loading kvm_intel module, the
- * callback function pointer will be assigned.
- *
- * protected by rcu.
- */
-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+/* RCU-protected callback to disable virtualization prior to reboot. */
+static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
-static inline void cpu_crash_vmclear_loaded_vmcss(void)
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
{
- crash_vmclear_fn *do_vmclear_operation = NULL;
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
+ return;
- rcu_read_lock();
- do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
- if (do_vmclear_operation)
- do_vmclear_operation();
- rcu_read_unlock();
+ rcu_assign_pointer(cpu_emergency_virt_callback, callback);
}
+EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
@@ -818,9 +819,15 @@ int crashing_cpu = -1;
*/
void cpu_emergency_disable_virtualization(void)
{
- cpu_crash_vmclear_loaded_vmcss();
+ cpu_emergency_virt_cb *callback;
- cpu_emergency_vmxoff();
+ rcu_read_lock();
+ callback = rcu_dereference(cpu_emergency_virt_callback);
+ if (callback)
+ callback();
+ rcu_read_unlock();
+
+ /* KVM_AMD doesn't yet utilize the common callback. */
cpu_emergency_svm_disable();
}
@@ -896,15 +903,11 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
shootdown_callback = callback;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
- NMI_FLAG_FIRST, "crash"))
- return; /* Return what? */
+
/*
- * Ensure the new callback function is set before sending
- * out the NMI
+ * Set emergency handler to preempt other handlers.
*/
- wmb();
+ set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback);
apic_send_IPI_allbutself(NMI_VECTOR);
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 3fe76bf17d95..e658e83c62ae 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -1064,3 +1064,21 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
RIP_REL_REF(cpuid_ext_range_max) = fn->eax;
}
}
+
+static inline void sev_evict_cache(void *va, int npages)
+{
+ volatile u8 val __always_unused;
+ u8 *bytes = va;
+ int page_idx;
+
+ /*
+ * For SEV guests, a read from the first/last cache-lines of a 4K page
+ * using the guest key is sufficient to cause a flush of all cache-lines
+ * associated with that 4K page without incurring all the overhead of a
+ * full CLFLUSH sequence.
+ */
+ for (page_idx = 0; page_idx < npages; page_idx++) {
+ val = bytes[page_idx * PAGE_SIZE];
+ val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1];
+ }
+}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index f8a8249ae117..7b7fa85d1547 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -676,10 +676,12 @@ static u64 __init get_jump_table_addr(void)
static void pvalidate_pages(unsigned long vaddr, unsigned long npages, bool validate)
{
- unsigned long vaddr_end;
+ unsigned long vaddr_begin, vaddr_end;
int rc;
vaddr = vaddr & PAGE_MASK;
+
+ vaddr_begin = vaddr;
vaddr_end = vaddr + (npages << PAGE_SHIFT);
while (vaddr < vaddr_end) {
@@ -689,6 +691,13 @@ static void pvalidate_pages(unsigned long vaddr, unsigned long npages, bool vali
vaddr = vaddr + PAGE_SIZE;
}
+
+ /*
+ * If validating memory (making it private) and affected by the
+ * cache-coherency vulnerability, perform the cache eviction mitigation.
+ */
+ if (validate && !cpu_feature_enabled(X86_FEATURE_COHERENCY_SFW_NO))
+ sev_evict_cache((void *)vaddr_begin, npages);
}
static void __head early_set_pages_state(unsigned long paddr, unsigned long npages, enum psc_op op)
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca8dfc0c9edb..93809a4dc793 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -79,7 +79,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
break;
case RET:
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ if (cpu_wants_rethunk_at(insn))
code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
else
code = &retinsn;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 78ccb5ec3c0e..35aeec9d2267 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -528,6 +528,16 @@ INIT_PER_CPU(irq_stack_backing_store);
"SRSO function pair won't alias");
#endif
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline");
+. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart");
+. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array");
+#endif
+
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline");
+#endif
+
#endif /* CONFIG_X86_64 */
#ifdef CONFIG_KEXEC_CORE
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 43b1df7ec183..1bb5e8f6c63e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -758,6 +758,12 @@ void kvm_set_cpu_caps(void)
if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+ kvm_cpu_cap_mask(CPUID_8000_0021_EAX, F(VERW_CLEAR));
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0021_ECX,
+ F(TSA_SQ_NO) | F(TSA_L1_NO)
+ );
+
/*
* Hide RDTSCP and RDPID if either feature is reported as supported but
* probing MSR_TSC_AUX failed. This is purely a sanity check and
@@ -1243,7 +1249,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x80000021:
- entry->ebx = entry->ecx = entry->edx = 0;
+ entry->ebx = entry->edx = 0;
/*
* Pass down these bits:
* EAX 0 NNDBP, Processor ignores nested data breakpoints
@@ -1259,6 +1265,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax |= BIT(2);
if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
entry->eax |= BIT(6);
+ cpuid_entry_override(entry, CPUID_8000_0021_ECX);
break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 28555bbd52e8..cb0a531e13c5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1406,8 +1406,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_set_msr(vcpu, msr, data, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
return 0;
@@ -1528,8 +1527,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
@@ -1581,7 +1579,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_get_msr(vcpu, msr, pdata, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
@@ -1646,7 +1644,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = APIC_BUS_FREQUENCY;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
*pdata = data;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 7f57dce5c828..41d8474ce555 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -587,7 +587,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(vec);
+ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -632,7 +632,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -640,6 +640,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
}
}
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
+ return;
+
+ static_call(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr);
+
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
/* This may race with setting of irr in __apic_accept_irq() and
@@ -673,6 +684,8 @@ static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
if (min > map->max_apic_id)
return 0;
+ min = array_index_nospec(min, map->max_apic_id + 1);
+
for_each_set_bit(i, ipi_bitmap,
min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
if (map->phys_map[min + i]) {
@@ -2315,11 +2328,25 @@ int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
data &= ~APIC_ICR_BUSY;
kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
- kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ if (kvm_x86_ops.x2apic_icr_is_split) {
+ kvm_lapic_set_reg(apic, APIC_ICR, data);
+ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ }
trace_kvm_apic_write(APIC_ICR, data);
return 0;
}
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+ if (kvm_x86_ops.x2apic_icr_is_split)
+ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+ return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
@@ -2337,7 +2364,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
* maybe-unecessary write, and both are in the noise anyways.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR)
- WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)));
+ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
else
kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
}
@@ -2540,7 +2567,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call_cond(kvm_x86_hwapic_isr_update)(-1);
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2760,18 +2787,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
/*
* In x2APIC mode, the LDR is fixed and based on the id. And
- * ICR is internally a single 64-bit register, but needs to be
- * split to ICR+ICR2 in userspace for backwards compatibility.
+ * if the ICR is _not_ split, ICR is internally a single 64-bit
+ * register, but needs to be split to ICR+ICR2 in userspace for
+ * backwards compatibility.
*/
- if (set) {
+ if (set)
*ldr = kvm_apic_calc_x2apic_ldr(*id);
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
- } else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ if (!kvm_x86_ops.x2apic_icr_is_split) {
+ if (set) {
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
}
}
@@ -2829,7 +2860,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -2971,7 +3002,7 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
u32 low;
if (reg == APIC_ICR) {
- *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ *data = kvm_x2apic_icr_read(apic);
return 0;
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a5ac4a5a5179..e5d2dc58fcf8 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -122,6 +122,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index e43909d6504a..7fbd24fbf363 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -14,6 +14,7 @@
enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
CPUID_7_2_EDX,
+ CPUID_8000_0021_ECX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -45,6 +46,10 @@ enum kvm_only_cpuid_leafs {
#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+/* CPUID level 0x80000021 (ECX) */
+#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1)
+#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -71,6 +76,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
[CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
+ [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX},
};
/*
@@ -107,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
KVM_X86_TRANSLATE_FEATURE(SGX2);
KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO);
+ KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO);
default:
return x86_feature;
}
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index fb125b54ee68..0adbf0677b7c 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -839,7 +839,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
*/
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
@@ -915,6 +915,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
+ bool enable_remapped_mode = true;
int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
@@ -952,6 +953,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
kvm_vcpu_apicv_active(&svm->vcpu)) {
struct amd_iommu_pi_data pi;
+ enable_remapped_mode = false;
+
/* Try to enable guest_mode in IRTE */
pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
AVIC_HPA_MASK);
@@ -970,33 +973,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
*/
if (!ret && pi.is_guest_mode)
svm_ir_list_add(svm, &pi);
- } else {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
-
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.prev_ga_tag = 0;
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
-
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
}
if (!ret && svm) {
@@ -1012,6 +988,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
}
ret = 0;
+ if (enable_remapped_mode) {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.prev_ga_tag = 0;
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index d8e192ad5953..0e0bc2c46b51 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1755,6 +1755,10 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
struct kvm_vcpu *src_vcpu;
unsigned long i;
+ if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
+ dst->created_vcpus != atomic_read(&dst->online_vcpus))
+ return -EBUSY;
+
if (!sev_es_guest(src))
return 0;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d7d1b0f7073f..5a6bd9d5cceb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1426,7 +1426,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
int i;
- for_each_online_cpu(i)
+ for_each_possible_cpu(i)
cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
}
@@ -1920,11 +1920,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -3035,8 +3035,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_DEBUGCTLMSR:
if (!lbrv) {
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}
@@ -3077,7 +3076,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_CR:
return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
- vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
@@ -3965,6 +3964,9 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
/*
* Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
* can't read guest memory (dereference memslots) to decode the WRMSR.
@@ -3982,6 +3984,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_enter_irqoff();
+ /*
+ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+ * VMRUN controls whether or not physical IRQs are masked (KVM always
+ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+ * into guest state if delivery of an event during VMRUN triggers a
+ * #VMEXIT, and the guest_state transitions already tell lockdep that
+ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+ * this path, so IRQs aren't actually unmasked while running host code.
+ */
+ raw_local_irq_enable();
+
amd_clear_divider();
if (sev_es_guest(vcpu->kvm))
@@ -3989,15 +4003,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
else
__svm_vcpu_run(svm, spec_ctrl_intercepted);
+ raw_local_irq_disable();
+
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -4016,9 +4033,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* is enough to force an immediate vmexit.
*/
disable_nmi_singlestep(svm);
- smp_send_reschedule(vcpu->cpu);
+ force_immediate_exit = true;
}
+ if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
pre_svm_run(vcpu);
sync_lapic_to_cr8(vcpu);
@@ -4032,13 +4052,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm_hv_update_vp_id(svm->vmcb, vcpu);
/*
- * Run with all-zero DR6 unless needed, so that we can get the exact cause
- * of a #DB.
+ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
+ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
+ * KVM's snapshot is only necessary when DR accesses won't exit.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
+ svm_set_dr6(vcpu, vcpu->arch.dr6);
+ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -4115,9 +4136,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
return svm_exit_handlers_fastpath(vcpu);
}
@@ -4838,6 +4856,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
+
+ .x2apic_icr_is_split = true,
.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
@@ -4859,8 +4879,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
- .request_immediate_exit = __kvm_request_immediate_exit,
-
.sched_in = svm_sched_in,
.nested_ops = &svm_nested_ops,
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 5be9a63f09ff..48b72625cc45 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -166,13 +166,12 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
- /* Enter guest mode */
- sti
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+ /* Enter guest mode */
3: vmrun %_ASM_AX
4:
- cli
-
/* Pop @svm to RAX while it's the only available register. */
pop %_ASM_AX
@@ -336,12 +335,12 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
- /* Enter guest mode */
- sti
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+ /* Enter guest mode */
1: vmrun %_ASM_AX
-
-2: cli
+2:
/* Pop @svm to RDI, guest registers have been saved already. */
pop %_ASM_DI
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6c1dcf44c4fa..ab407bc00d84 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -15,20 +15,23 @@
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
- TP_PROTO(struct kvm_vcpu *vcpu),
- TP_ARGS(vcpu),
+ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
+ TP_ARGS(vcpu, force_immediate_exit),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned long, rip )
+ __field( bool, immediate_exit )
),
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
+ __entry->immediate_exit = force_immediate_exit;
),
- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
+ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
+ __entry->immediate_exit ? "[immediate exit]" : "")
);
/*
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8052f8b7d8e1..2c3cf4351c4c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2532,10 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+ vmx_get_supported_debugctl(vcpu, false));
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
}
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3022,7 +3023,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
@@ -3402,7 +3404,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
if (kvm_mpx_supported() &&
(!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -4374,6 +4376,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ /*
+ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
+ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
+ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
+ * vmcs02 doesn't strictly track vmcs12.
+ */
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
@@ -4564,7 +4572,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ vmx_guest_debugctl_write(vcpu, 0);
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
@@ -4619,6 +4627,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
}
+ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
* handle a variety of side effects to KVM's software model.
@@ -4839,6 +4850,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
+ if (vmx->nested.update_vmcs01_hwapic_isr) {
+ vmx->nested.update_vmcs01_hwapic_isr = false;
+ kvm_apic_update_hwapic_isr(vcpu);
+ }
+
if ((vm_exit_reason != -1) &&
(enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 220cdbe1e286..76d3ed8abf6a 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -672,11 +672,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
*/
static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
{
- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ u64 data = vmx_guest_debugctl_read();
if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
data &= ~DEBUGCTLMSR_LBR;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
}
}
@@ -746,7 +746,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
if (!lbr_desc->event) {
vmx_disable_lbr_msrs_passthrough(vcpu);
- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
goto warn;
if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
goto warn;
@@ -769,7 +769,7 @@ warn:
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
{
- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
intel_pmu_release_guest_lbr_event(vcpu);
}
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 1b56c5e5c9fb..0d04382f37af 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -272,6 +272,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
+ bool enable_remapped_mode = true;
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
@@ -310,21 +311,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- /*
- * Make sure the IRTE is in remapped mode if
- * we don't handle it in posted mode.
- */
- ret = irq_set_vcpu_affinity(host_irq, NULL);
- if (ret < 0) {
- printk(KERN_INFO
- "failed to back to remapped mode, irq: %u\n",
- host_irq);
- goto out;
- }
-
+ !kvm_irq_is_postable(&irq))
continue;
- }
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
@@ -332,11 +320,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
- if (set)
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else
- ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (!set)
+ continue;
+ enable_remapped_mode = false;
+
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
if (ret < 0) {
printk(KERN_INFO "%s: failed to update PI IRTE\n",
__func__);
@@ -344,6 +333,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
}
}
+ if (enable_remapped_mode)
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
ret = 0;
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 460ba48eb66c..22e4c9bbbcb4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -49,6 +49,8 @@
#include <asm/virtext.h>
#include <asm/vmx.h>
+#include <trace/events/ipi.h>
+
#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
@@ -705,14 +707,19 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
return ret;
}
-static void crash_vmclear_local_loaded_vmcss(void)
+static void vmx_emergency_disable(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
+
+ __cpu_emergency_vmxoff();
}
static void __loaded_vmcs_clear(void *arg)
@@ -1223,8 +1230,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
@@ -1418,13 +1423,9 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
*/
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
-
- vmx->host_debugctlmsr = get_debugctlmsr();
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2031,7 +2032,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ msr_info->data = vmx_guest_debugctl_read();
break;
default:
find_uret_msr:
@@ -2056,7 +2057,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
u64 debugctl = 0;
@@ -2068,9 +2069,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
return debugctl;
}
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -2139,31 +2156,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid;
-
- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- }
-
- if (invalid)
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
return 1;
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
+
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu);
return 0;
- }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -4749,7 +4757,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -5929,22 +5938,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->req_immediate_exit &&
- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
- kvm_lapic_expired_hv_timer(vcpu);
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
return EXIT_FASTPATH_REENTER_GUEST;
- }
- return EXIT_FASTPATH_NONE;
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
}
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- handle_fastpath_preemption_timer(vcpu);
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
@@ -6702,11 +6735,27 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
put_page(page);
}
-static void vmx_hwapic_isr_update(int max_isr)
+static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
+ /*
+ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+ * is only relevant for if and only if Virtual Interrupt Delivery is
+ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
+ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
+ * VM-Exit, otherwise L1 with run with a stale SVI.
+ */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ */
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
if (max_isr == -1)
max_isr = 0;
@@ -7051,13 +7100,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->req_immediate_exit) {
+ if (force_immediate_exit) {
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
} else if (vmx->hv_deadline_tsc != -1) {
@@ -7110,13 +7159,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
barrier_nospec();
}
-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+ * the fastpath even, all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
- return handle_fastpath_preemption_timer(vcpu);
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
default:
return EXIT_FASTPATH_NONE;
}
@@ -7138,7 +7196,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mmio_stale_data_clear) &&
kvm_arch_has_assigned_device(vcpu->kvm))
- mds_clear_cpu_buffers();
+ x86_clear_cpu_buffers();
vmx_disable_fb_clear(vmx);
@@ -7155,8 +7213,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7182,7 +7241,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
@@ -7201,6 +7260,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
@@ -7220,10 +7285,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -7241,7 +7302,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
- vmx_update_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
kvm_wait_lapic_expire(vcpu);
@@ -7257,8 +7320,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
}
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
@@ -7315,10 +7378,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
- return vmx_exit_handlers_fastpath(vcpu);
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
@@ -7825,11 +7885,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- to_vmx(vcpu)->req_immediate_exit = true;
-}
-
static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info)
{
@@ -8150,6 +8205,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
+ .HOST_OWNED_DEBUGCTL = DEBUGCTLMSR_FREEZE_IN_SMM,
+
.update_exception_bitmap = vmx_update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
@@ -8199,6 +8256,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.enable_nmi_window = vmx_enable_nmi_window,
.enable_irq_window = vmx_enable_irq_window,
.update_cr8_intercept = vmx_update_cr8_intercept,
+
+ .x2apic_icr_is_split = false,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@ -8232,8 +8291,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .request_immediate_exit = vmx_request_immediate_exit,
-
.sched_in = vmx_sched_in,
.cpu_dirty_log_size = PML_ENTITY_NUM,
@@ -8490,7 +8547,6 @@ static __init int hardware_setup(void)
if (!enable_preemption_timer) {
vmx_x86_ops.set_hv_timer = NULL;
vmx_x86_ops.cancel_hv_timer = NULL;
- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
@@ -8551,8 +8607,7 @@ static void __vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
+ cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
vmx_cleanup_l1d_flush();
}
@@ -8626,8 +8681,7 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
- crash_vmclear_local_loaded_vmcss);
+ cpu_emergency_register_virt_callback(vmx_emergency_disable);
vmx_check_vmcs12_offsets();
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9e0bb98b116d..dc6f06326648 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -189,6 +189,7 @@ struct nested_vmx {
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
bool update_vmcs01_apicv_status;
+ bool update_vmcs01_hwapic_isr;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -342,8 +343,6 @@ struct vcpu_vmx {
unsigned int ple_window;
bool ple_window_dirty;
- bool req_immediate_exit;
-
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
@@ -351,8 +350,6 @@ struct vcpu_vmx {
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
- unsigned long host_debugctlmsr;
-
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
@@ -445,6 +442,32 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+{
+ WARN_ON_ONCE(val & DEBUGCTLMSR_FREEZE_IN_SMM);
+
+ val |= vcpu->arch.host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+}
+
+static inline u64 vmx_guest_debugctl_read(void)
+{
+ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~DEBUGCTLMSR_FREEZE_IN_SMM;
+}
+
+static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (!((val ^ vcpu->arch.host_debugctl) & DEBUGCTLMSR_FREEZE_IN_SMM))
+ return;
+
+ vmx_guest_debugctl_write(vcpu, val & ~DEBUGCTLMSR_FREEZE_IN_SMM);
+}
+
/*
* Note, early Intel manuals have the write-low and read-high bitmap offsets
* the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4d5be4956c48..11ca05d830e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1614,7 +1614,7 @@ static unsigned int num_msr_based_features;
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
- ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1653,6 +1653,8 @@ static u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_MDS_NO;
if (!boot_cpu_has_bug(X86_BUG_RFDS))
data |= ARCH_CAP_RFDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ data |= ARCH_CAP_ITS_NO;
if (!boot_cpu_has(X86_FEATURE_RTM)) {
/*
@@ -3571,7 +3573,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- bool pr = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
@@ -3623,15 +3624,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data == BIT_ULL(18)) {
vcpu->arch.msr_hwcr = data;
} else if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
- data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
- "0x%llx\n", data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
@@ -3811,16 +3810,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- pr = true;
- fallthrough;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (pr || data != 0)
- vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
+ if (data)
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_K7_CLK_CTL:
/*
@@ -3847,9 +3843,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
@@ -9687,8 +9681,11 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
rcu_read_lock();
map = rcu_dereference(vcpu->kvm->arch.apic_map);
- if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
- target = map->phys_map[dest_id]->vcpu;
+ if (likely(map) && dest_id <= map->max_apic_id) {
+ dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
+ if (map->phys_map[dest_id])
+ target = map->phys_map[dest_id]->vcpu;
+ }
rcu_read_unlock();
@@ -10584,12 +10581,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10603,6 +10594,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
+ u64 run_flags, debug_ctl;
bool req_immediate_exit = false;
@@ -10823,9 +10815,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
+ run_flags = 0;
if (req_immediate_exit) {
+ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_request_immediate_exit)(vcpu);
}
fpregs_assert_state_consistent();
@@ -10841,10 +10834,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
+ /*
+ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
+ * can be modified in IRQ context, e.g. via SMP function calls. Inform
+ * vendor code if any host-owned bits were changed, e.g. so that the
+ * value loaded into hardware while running the guest can be updated.
+ */
+ debug_ctl = get_debugctlmsr();
+ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
+ !vcpu->arch.guest_state_protected)
+ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
+ vcpu->arch.host_debugctl = debug_ctl;
+
guest_timing_enter_irqoff();
for (;;) {
@@ -10857,7 +10865,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, run_flags);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -10869,6 +10877,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
break;
}
+ run_flags = 0;
+
/* Note, VM-Exits that go down the "slow" path are accounted below. */
++vcpu->stat.exits;
}
@@ -10916,6 +10926,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
wrmsrl(MSR_IA32_XFD_ERR, 0);
/*
+ * Mark this CPU as needing a branch predictor flush before running
+ * userspace. Must be done before enabling preemption to ensure it gets
+ * set for the CPU that actually ran the guest, and not the CPU that it
+ * may migrate to.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER))
+ this_cpu_write(x86_ibpb_exit_to_user, true);
+
+ /*
* Consume any pending interrupts, including the possible source of
* VM-Exit on SVM and any ticks that occur between VM-Exit and now.
* An instruction is required after local_irq_enable() to fully unblock
@@ -13380,16 +13399,22 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
int ret;
- irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
-
if (ret)
kvm_arch_end_assignment(irqfd->kvm);
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+
return ret;
}
@@ -13399,9 +13424,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int ret;
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
WARN_ON(irqfd->producer != prod);
- irqfd->producer = NULL;
/*
* When producer of consumer is unregistered, we change back to
@@ -13409,11 +13434,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = NULL;
+
+
ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+
kvm_arch_end_assignment(irqfd->kvm);
}
@@ -13426,7 +13458,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
- if (new->type != KVM_IRQ_ROUTING_MSI)
+ if (old->type != KVM_IRQ_ROUTING_MSI ||
+ new->type != KVM_IRQ_ROUTING_MSI)
return true;
return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9de72586f406..f3554bf05201 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -331,6 +331,18 @@ extern bool report_ignored_msrs;
extern bool eager_page_split;
+static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+}
+
+static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+}
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 684a39df60d9..8e38f976feb4 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1536,8 +1536,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
{
struct kvm_vcpu *vcpu;
- if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
- return -EINVAL;
+ /*
+ * Don't check for the port being within range of max_evtchn_port().
+ * Userspace can configure what ever targets it likes; events just won't
+ * be delivered if/while the target is invalid, just like userspace can
+ * configure MSIs which target non-existent APICs.
+ *
+ * This allow on Live Migration and Live Update, the IRQ routing table
+ * can be restored *independently* of other things like creating vCPUs,
+ * without imposing an ordering dependency on userspace. In this
+ * particular case, the problematic ordering would be with setting the
+ * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096
+ * instead of 1024 event channels.
+ */
/* We only support 2 level event channels for now */
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 7880e2a7ec6a..e42661500094 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -258,6 +258,45 @@ SYM_FUNC_START(entry_untrain_ret)
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)
+#ifdef CONFIG_MITIGATION_ITS
+
+.macro ITS_THUNK reg
+
+SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%\reg
+ int3
+ .align 32, 0xcc /* fill to the end of the line */
+ .skip 32, 0xcc /* skip to the next upper half */
+.endm
+
+/* ITS mitigation requires thunks be aligned to upper half of cacheline */
+.align 64, 0xcc
+.skip 32, 0xcc
+SYM_CODE_START(__x86_indirect_its_thunk_array)
+
+#define GEN(reg) ITS_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+ .align 64, 0xcc
+SYM_CODE_END(__x86_indirect_its_thunk_array)
+
+.align 64, 0xcc
+.skip 32, 0xcc
+SYM_CODE_START(its_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_CODE_END(its_return_thunk)
+EXPORT_SYMBOL(its_return_thunk)
+
+#endif /* CONFIG_MITIGATION_ITS */
+
SYM_CODE_START(__x86_return_thunk)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 60814e110a54..fcb8e7af5b4d 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -121,13 +121,12 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup,
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs)
{
- regs->ip = ex_fixup_addr(fixup);
-
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
fpu_reset_from_exception_fixup();
- return true;
+
+ return ex_handler_default(fixup, regs);
}
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ab697ee64528..446bf7fbc325 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -654,8 +654,13 @@ static void __init memory_map_top_down(unsigned long map_start,
*/
addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
map_end);
- memblock_phys_free(addr, PMD_SIZE);
- real_end = addr + PMD_SIZE;
+ if (!addr) {
+ pr_warn("Failed to release memory for alloc_low_pages()");
+ real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE));
+ } else {
+ memblock_phys_free(addr, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+ }
/* step_size need to be small so pgt_buf from BRK could cover it */
step_size = PMD_SIZE;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 851711509d38..339bf2195486 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -224,6 +224,24 @@ static void sync_global_pgds(unsigned long start, unsigned long end)
}
/*
+ * Make kernel mappings visible in all page tables in the system.
+ * This is necessary except when the init task populates kernel mappings
+ * during the boot process. In that case, all processes originating from
+ * the init task copies the kernel mappings, so there is no issue.
+ * Otherwise, missing synchronization could lead to kernel crashes due
+ * to missing page table entries for certain kernel mappings.
+ *
+ * Synchronization is performed at the top level, which is the PGD in
+ * 5-level paging systems. But in 4-level paging systems, however,
+ * pgd_populate() is a no-op, so synchronization is done at the P4D level.
+ * sync_global_pgds() handles this difference between paging levels.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+ sync_global_pgds(start, end);
+}
+
+/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
*/
@@ -959,9 +977,18 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
- /* update max_pfn, max_low_pfn and high_memory */
- update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
- nr_pages << PAGE_SHIFT);
+ /*
+ * Special case: add_pages() is called by memremap_pages() for adding device
+ * private pages. Do not bump up max_pfn in the device private path,
+ * because max_pfn changes affect dma_addressing_limited().
+ *
+ * dma_addressing_limited() returning true when max_pfn is the device's
+ * addressable memory can force device drivers to use bounce buffers
+ * and impact their performance negatively:
+ */
+ if (!params->pgmap)
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
return ret;
}
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 230f1dee4f09..e0b0ec0f8245 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -109,8 +109,14 @@ void __init kernel_randomize_memory(void)
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
- /* Adapt physical memory region size based on available memory */
- if (memory_tb < kaslr_regions[0].size_tb)
+ /*
+ * Adapt physical memory region size based on available memory,
+ * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the
+ * device BAR space assuming the direct map space is large enough
+ * for creating a ZONE_DEVICE mapping in the direct map corresponding
+ * to the physical BAR address.
+ */
+ if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb))
kaslr_regions[0].size_tb = memory_tb;
/*
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index b07e2167fceb..4918268d2007 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -385,9 +385,9 @@ static void cond_mitigation(struct task_struct *next)
prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
/*
- * Avoid user/user BTB poisoning by flushing the branch predictor
- * when switching between processes. This stops one process from
- * doing Spectre-v2 attacks on another.
+ * Avoid user->user BTB/RSB poisoning by flushing them when switching
+ * between processes. This stops one process from doing Spectre-v2
+ * attacks on another.
*
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
@@ -617,7 +617,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
+ /*
+ * Indicate that CR3 is about to change. nmi_uaccess_okay()
+ * and others are sensitive to the window where mm_cpumask(),
+ * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
+ */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
}
@@ -880,8 +884,16 @@ done:
static bool should_flush_tlb(int cpu, void *data)
{
+ struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
struct flush_tlb_info *info = data;
+ /*
+ * Order the 'loaded_mm' and 'is_lazy' against their
+ * write ordering in switch_mm_irqs_off(). Ensure
+ * 'is_lazy' is at least as new as 'loaded_mm'.
+ */
+ smp_rmb();
+
/* Lazy TLB will get flushed at the next context switch. */
if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
return false;
@@ -890,8 +902,15 @@ static bool should_flush_tlb(int cpu, void *data)
if (!info->mm)
return true;
+ /*
+ * While switching, the remote CPU could have state from
+ * either the prev or next mm. Assume the worst and flush.
+ */
+ if (loaded_mm == LOADED_MM_SWITCHING)
+ return true;
+
/* The target mm is loaded, and the CPU is not lazy. */
- if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ if (loaded_mm == info->mm)
return true;
/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 92db785a0a8e..f3068bb53c4d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -36,6 +36,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+#define EMIT5(b1, b2, b3, b4, b5) \
+ do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0)
#define EMIT1_off32(b1, off) \
do { EMIT1(b1); EMIT(off, 4); } while (0)
@@ -462,7 +464,11 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
{
u8 *prog = *pprog;
- if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_ITS) &&
+ cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) {
+ OPTIMIZER_HIDE_VAR(reg);
+ emit_jump(&prog, its_static_thunk(reg), ip);
+ } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
EMIT_LFENCE();
EMIT2(0xFF, 0xE0 + reg);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
@@ -481,7 +487,7 @@ static void emit_return(u8 **pprog, u8 *ip)
{
u8 *prog = *pprog;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+ if (cpu_wants_rethunk()) {
emit_jump(&prog, x86_return_thunk, ip);
} else {
EMIT1(0xC3); /* ret */
@@ -947,6 +953,47 @@ static void emit_nops(u8 **pprog, int len)
#define RESTORE_TAIL_CALL_CNT(stack) \
EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
+static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip,
+ struct bpf_prog *bpf_prog)
+{
+ u8 *prog = *pprog;
+ u8 *func;
+
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) {
+ /* The clearing sequence clobbers eax and ecx. */
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x51); /* push rcx */
+ ip += 2;
+
+ func = (u8 *)clear_bhb_loop;
+
+ if (emit_call(&prog, func, ip))
+ return -EINVAL;
+ EMIT1(0x59); /* pop rcx */
+ EMIT1(0x58); /* pop rax */
+ }
+ /* Insert IBHF instruction */
+ if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) &&
+ cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) ||
+ cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) {
+ /*
+ * Add an Indirect Branch History Fence (IBHF). IBHF acts as a
+ * fence preventing branch history from before the fence from
+ * affecting indirect branches after the fence. This is
+ * specifically used in cBPF jitted code to prevent Intra-mode
+ * BHI attacks. The IBHF instruction is designed to be a NOP on
+ * hardware that doesn't need or support it. The REP and REX.W
+ * prefixes are required by the microcode, and they also ensure
+ * that the NOP is unlikely to be used in existing code.
+ *
+ * IBHF is not a valid instruction in 32-bit mode.
+ */
+ EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */
+ }
+ *pprog = prog;
+ return 0;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
@@ -1760,6 +1807,15 @@ emit_jmp:
seen_exit = true;
/* Update cleanup_addr */
ctx->cleanup_addr = proglen;
+
+ if (bpf_prog_was_classic(bpf_prog) &&
+ !capable(CAP_SYS_ADMIN)) {
+ u8 *ip = image + addrs[i - 1];
+
+ if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog))
+ return -EINVAL;
+ }
+
pop_callee_regs(&prog, callee_regs_used);
EMIT1(0xC9); /* leave */
emit_return(&prog, image + addrs[i - 1] + (prog - temp));
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 472540aeabc2..08cd913cbd4e 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -10,8 +10,7 @@
#include <assert.h>
#include <unistd.h>
#include <stdarg.h>
-
-#define unlikely(cond) (cond)
+#include <linux/kallsyms.h>
#include <asm/insn.h>
#include <inat.c>
@@ -106,7 +105,7 @@ static void parse_args(int argc, char **argv)
}
}
-#define BUFSIZE 256
+#define BUFSIZE (256 + KSYM_NAME_LEN)
int main(int argc, char **argv)
{
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index b07824500363..ddc144657efa 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -20,6 +20,9 @@
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
+/* Do not call this directly. Declared for export type visibility. */
+extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
+
/**
* csum_fold - Fold and invert a 32bit checksum.
* sum: 32bit unfolded sum
diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c
index 49c3744cac37..81b9d1f9f4e6 100644
--- a/arch/x86/um/os-Linux/mcontext.c
+++ b/arch/x86/um/os-Linux/mcontext.c
@@ -26,7 +26,6 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
COPY(RIP);
COPY2(EFLAGS, EFL);
COPY2(CS, CSGSFS);
- regs->gp[CS / sizeof(unsigned long)] &= 0xffff;
- regs->gp[CS / sizeof(unsigned long)] |= 3;
+ regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48;
#endif
}