summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig10
-rw-r--r--arch/x86/boot/compressed/head_64.S10
-rw-r--r--arch/x86/boot/compressed/pgtable.h2
-rw-r--r--arch/x86/entry/entry_64_compat.S6
-rw-r--r--arch/x86/events/intel/core.c16
-rw-r--r--arch/x86/events/intel/uncore_snbep.c4
-rw-r--r--arch/x86/include/asm/intel-family.h3
-rw-r--r--arch/x86/include/asm/mmu_context.h18
-rw-r--r--arch/x86/include/asm/page_64_types.h4
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/resctrl_sched.h4
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/bugs.c4
-rw-r--r--arch/x86/kernel/cpu/mce/core.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile4
-rw-r--r--arch/x86/kernel/crash.c1
-rw-r--r--arch/x86/kernel/hpet.c4
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c5
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/tsc.c30
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/hyperv.c7
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu.c1
-rw-r--r--arch/x86/kvm/svm.c34
-rw-r--r--arch/x86/kvm/trace.h2
-rw-r--r--arch/x86/kvm/vmx/evmcs.c7
-rw-r--r--arch/x86/kvm/vmx/nested.c18
-rw-r--r--arch/x86/kvm/vmx/vmx.c154
-rw-r--r--arch/x86/kvm/x86.c15
-rw-r--r--arch/x86/lib/iomem.c33
-rw-r--r--arch/x86/lib/kaslr.c4
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c4
-rw-r--r--arch/x86/mm/pageattr.c50
-rw-r--r--arch/x86/xen/enlighten_pv.c5
-rw-r--r--arch/x86/xen/time.c12
39 files changed, 299 insertions, 196 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6185d4f33296..68261430fe6e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -198,7 +198,7 @@ config X86
select IRQ_FORCED_THREADING
select NEED_SG_DMA_LENGTH
select PCI_DOMAINS if PCI
- select PCI_LOCKLESS_CONFIG
+ select PCI_LOCKLESS_CONFIG if PCI
select PERF_EVENTS
select RTC_LIB
select RTC_MC146818_LIB
@@ -446,12 +446,12 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
-config RESCTRL
- bool "Resource Control support"
+config X86_CPU_RESCTRL
+ bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select KERNFS
help
- Enable Resource Control support.
+ Enable x86 CPU resource control support.
Provide support for the allocation and monitoring of system resources
usage by the CPU.
@@ -617,7 +617,7 @@ config X86_INTEL_QUARK
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
- depends on X86 && ACPI
+ depends on X86 && ACPI && PCI
select COMMON_CLK
select PINCTRL
select IOSF_MBI
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 64037895b085..f62e347862cc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -600,6 +600,16 @@ ENTRY(trampoline_32bit_src)
leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
movl %eax, %cr3
3:
+ /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
+ pushl %ecx
+ pushl %edx
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+ popl %edx
+ popl %ecx
+
/* Enable PAE and LA57 (if required) paging modes */
movl $X86_CR4_PAE, %eax
cmpl $0, %edx
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 91f75638f6e6..6ff7e81b5628 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x60
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 8eaf8952c408..39913770a44d 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -361,7 +361,8 @@ ENTRY(entry_INT80_compat)
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
- movq %rsp, %rdi
+ /* In the Xen PV case we already run on the thread stack. */
+ ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 6*8(%rdi) /* regs->ss */
@@ -370,8 +371,9 @@ ENTRY(entry_INT80_compat)
pushq 3*8(%rdi) /* regs->cs */
pushq 2*8(%rdi) /* regs->ip */
pushq 1*8(%rdi) /* regs->orig_ax */
-
pushq (%rdi) /* pt_regs->di */
+.Lint80_keep_stack:
+
pushq %rsi /* pt_regs->si */
xorl %esi, %esi /* nospec si */
pushq %rdx /* pt_regs->dx */
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 40e12cfc87f6..daafb893449b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3559,6 +3559,14 @@ static void free_excl_cntrs(int cpu)
static void intel_pmu_cpu_dying(int cpu)
{
+ fini_debug_store_on_cpu(cpu);
+
+ if (x86_pmu.counter_freezing)
+ disable_counter_freeze();
+}
+
+static void intel_pmu_cpu_dead(int cpu)
+{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
struct intel_shared_regs *pc;
@@ -3570,11 +3578,6 @@ static void intel_pmu_cpu_dying(int cpu)
}
free_excl_cntrs(cpu);
-
- fini_debug_store_on_cpu(cpu);
-
- if (x86_pmu.counter_freezing)
- disable_counter_freeze();
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3663,6 +3666,7 @@ static __initconst const struct x86_pmu core_pmu = {
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
};
static struct attribute *intel_pmu_attrs[];
@@ -3703,6 +3707,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_sched_task,
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index c07bee31abe8..b10e04387f38 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1222,6 +1222,8 @@ static struct pci_driver snbep_uncore_pci_driver = {
.id_table = snbep_uncore_pci_ids,
};
+#define NODE_ID_MASK 0x7
+
/*
* build pci bus to socket mapping
*/
@@ -1243,7 +1245,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
if (err)
break;
- nodeid = config;
+ nodeid = config & NODE_ID_MASK;
/* get the Node ID mapping */
err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
if (err)
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 0dd6b0f4000e..d9a9993af882 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -6,7 +6,7 @@
* "Big Core" Processors (Branded as Core, Xeon, etc...)
*
* The "_X" parts are generally the EP and EX Xeons, or the
- * "Extreme" ones, like Broadwell-E.
+ * "Extreme" ones, like Broadwell-E, or Atom microserver.
*
* While adding a new CPUID for a new microarchitecture, add a new
* group to keep logically sorted out in chronological order. Within
@@ -71,6 +71,7 @@
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 0ca50611e8ce..19d18fae6ec6 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -178,6 +178,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+/*
+ * Init a new mm. Used on mm copies, like at fork()
+ * and on mm's that are brand-new, like at execve().
+ */
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
@@ -228,8 +232,22 @@ do { \
} while (0)
#endif
+static inline void arch_dup_pkeys(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+#endif
+}
+
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
+ arch_dup_pkeys(oldmm, mm);
paravirt_arch_dup_mmap(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 8f657286d599..0ce558a8150d 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -7,7 +7,11 @@
#endif
#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_EXTRA
+#define KASAN_STACK_ORDER 2
+#else
#define KASAN_STACK_ORDER 1
+#endif
#else
#define KASAN_STACK_ORDER 0
#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e805292..2779ace16d23 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1065,7 +1065,7 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- native_set_pmd(pmdp, pmd);
+ set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl_sched.h
index 54990fe2a3ae..f6b7fe2833cc 100644
--- a/arch/x86/include/asm/resctrl_sched.h
+++ b/arch/x86/include/asm/resctrl_sched.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_RESCTRL_SCHED_H
#define _ASM_X86_RESCTRL_SCHED_H
-#ifdef CONFIG_RESCTRL
+#ifdef CONFIG_X86_CPU_RESCTRL
#include <linux/sched.h>
#include <linux/jump_label.h>
@@ -88,6 +88,6 @@ static inline void resctrl_sched_in(void)
static inline void resctrl_sched_in(void) {}
-#endif /* CONFIG_RESCTRL */
+#endif /* CONFIG_X86_CPU_RESCTRL */
#endif /* _ASM_X86_RESCTRL_SCHED_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a77445d1b034..780f2b42c8ef 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -711,7 +711,7 @@ static __must_check inline bool user_access_begin(const void __user *ptr, size_t
{
if (unlikely(!access_ok(ptr,len)))
return 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
return 1;
}
#define user_access_begin(a,b) user_access_begin(a,b)
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ac78f90aea56..cfd24f9f7614 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
-obj-$(CONFIG_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 8654b8b0c848..01874d54f4fd 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -71,7 +71,7 @@ void __init check_bugs(void)
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
*/
- cpu_smt_check_topology_early();
+ cpu_smt_check_topology();
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
@@ -215,7 +215,7 @@ static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
SPECTRE_V2_USER_NONE;
-#ifdef RETPOLINE
+#ifdef CONFIG_RETPOLINE
static bool spectre_v2_bad_module;
bool retpoline_module_ok(bool has_retpoline)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 672c7225cb1b..6ce290c506d9 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -784,6 +784,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
quirk_no_way_out(i, m, regs);
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ m->bank = i;
mce_read_aux(m, i);
*msg = tmp;
return 1;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 51adde0a0f1a..e1f3ba19ba54 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -855,7 +855,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
if (!p) {
return ret;
} else {
- if (boot_cpu_data.microcode == p->patch_id)
+ if (boot_cpu_data.microcode >= p->patch_id)
return ret;
ret = UCODE_NEW;
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 6895049ceef7..4a06c37b9cf1 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c8b07d8ea5a2..17ffc869cab8 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -470,6 +470,7 @@ int crash_load_segments(struct kimage *image)
kbuf.memsz = kbuf.bufsz;
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret) {
vfree((void *)image->arch.elf_headers);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b0acb22e5a46..dfd3aca82c61 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -21,10 +21,6 @@
#define HPET_MASK CLOCKSOURCE_MASK(32)
-/* FSEC = 10^-15
- NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000L
-
#define HPET_DEV_USED_BIT 2
#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
#define HPET_DEV_VALID 0x8
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 278cd07228dd..53917a3ebf94 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -167,6 +167,9 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
struct efi_info *current_ei = &boot_params.efi_info;
struct efi_info *ei = &params->efi_info;
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return 0;
+
if (!current_ei->efi_memmap_size)
return 0;
@@ -434,6 +437,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.memsz = PAGE_ALIGN(header->init_size);
kbuf.buf_align = header->kernel_alignment;
kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
@@ -448,6 +452,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz = kbuf.memsz = initrd_len;
kbuf.buf_align = PAGE_SIZE;
kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index ba4bfb7f6a36..5c93a65ee1e5 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -457,6 +457,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
#else
u64 ipi_bitmap = 0;
#endif
+ long ret;
if (cpumask_empty(mask))
return;
@@ -482,8 +483,9 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
max = apic_id < max ? max : apic_id;
} else {
- kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
min = max = apic_id;
ipi_bitmap = 0;
}
@@ -491,8 +493,9 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
}
if (ipi_bitmap) {
- kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
}
local_irq_restore(flags);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e9f777bfed40..3fae23834069 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -297,15 +297,16 @@ static int __init tsc_setup(char *str)
__setup("tsc=", tsc_setup);
-#define MAX_RETRIES 5
-#define SMI_TRESHOLD 50000
+#define MAX_RETRIES 5
+#define TSC_DEFAULT_THRESHOLD 0x20000
/*
- * Read TSC and the reference counters. Take care of SMI disturbance
+ * Read TSC and the reference counters. Take care of any disturbances
*/
static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
+ u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
int i;
for (i = 0; i < MAX_RETRIES; i++) {
@@ -315,7 +316,7 @@ static u64 tsc_read_refs(u64 *p, int hpet)
else
*p = acpi_pm_read_early();
t2 = get_cycles();
- if ((t2 - t1) < SMI_TRESHOLD)
+ if ((t2 - t1) < thresh)
return t2;
}
return ULLONG_MAX;
@@ -703,15 +704,15 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
* zero. In each wait loop iteration we read the TSC and check
* the delta to the previous read. We keep track of the min
* and max values of that delta. The delta is mostly defined
- * by the IO time of the PIT access, so we can detect when a
- * SMI/SMM disturbance happened between the two reads. If the
+ * by the IO time of the PIT access, so we can detect when
+ * any disturbance happened between the two reads. If the
* maximum time is significantly larger than the minimum time,
* then we discard the result and have another try.
*
* 2) Reference counter. If available we use the HPET or the
* PMTIMER as a reference to check the sanity of that value.
* We use separate TSC readouts and check inside of the
- * reference read for a SMI/SMM disturbance. We dicard
+ * reference read for any possible disturbance. We dicard
* disturbed values here as well. We do that around the PIT
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
@@ -744,7 +745,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
if (ref1 == ref2)
continue;
- /* Check, whether the sampling was disturbed by an SMI */
+ /* Check, whether the sampling was disturbed */
if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
continue;
@@ -1268,7 +1269,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
*/
static void tsc_refine_calibration_work(struct work_struct *work)
{
- static u64 tsc_start = -1, ref_start;
+ static u64 tsc_start = ULLONG_MAX, ref_start;
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
@@ -1283,14 +1284,15 @@ static void tsc_refine_calibration_work(struct work_struct *work)
* delayed the first time we expire. So set the workqueue
* again once we know timers are working.
*/
- if (tsc_start == -1) {
+ if (tsc_start == ULLONG_MAX) {
+restart:
/*
* Only set hpet once, to avoid mixing hardware
* if the hpet becomes enabled later.
*/
hpet = is_hpet_enabled();
- schedule_delayed_work(&tsc_irqwork, HZ);
tsc_start = tsc_read_refs(&ref_start, hpet);
+ schedule_delayed_work(&tsc_irqwork, HZ);
return;
}
@@ -1300,9 +1302,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
if (ref_start == ref_stop)
goto out;
- /* Check, whether the sampling was disturbed by an SMI */
- if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
- goto out;
+ /* Check, whether the sampling was disturbed */
+ if (tsc_stop == ULLONG_MAX)
+ goto restart;
delta = tsc_stop - tsc_start;
delta *= 1000000LL;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 69b3a7c30013..31ecf7a76d5a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,10 +2,6 @@
ccflags-y += -Iarch/x86/kvm
-CFLAGS_x86.o := -I.
-CFLAGS_svm.o := -I.
-CFLAGS_vmx.o := -I.
-
KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c90a5352d158..89d20ed1d2e8 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1636,7 +1636,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
if (ret != HV_STATUS_INVALID_PORT_ID)
break;
- /* maybe userspace knows this conn_id: fall through */
+ /* fall through - maybe userspace knows this conn_id. */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
@@ -1832,7 +1832,6 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE;
ent->eax |= HV_X64_MSR_RESET_AVAILABLE;
ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
- ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE;
ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS;
ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT;
@@ -1848,11 +1847,11 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_ENLIGHTMENT_INFO:
ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
- ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED;
ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
- ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+ if (evmcs_ver)
+ ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
/*
* Default number of spinlock retry attempts, matches
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9f089e2e09d0..4b6c2da7265c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1035,6 +1035,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
+ /* fall through */
case APIC_DM_FIXED:
if (unlikely(trig_mode && !level))
break;
@@ -1874,6 +1875,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVT0:
apic_manage_nmi_watchdog(apic, val);
+ /* fall through */
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ce770b446238..da9c42349b1f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4371,6 +4371,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[1][4] =
rsvd_check->rsvd_bits_mask[0][4];
+ /* fall through */
case PT64_ROOT_4LEVEL:
rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 307e5bddb6d9..f13a3a24d360 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3414,6 +3414,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
return 0;
}
@@ -4395,7 +4403,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_IA32_APICBASE:
if (kvm_vcpu_apicv_active(vcpu))
avic_update_vapic_bar(to_svm(vcpu), data);
- /* Follow through */
+ /* Fall through */
default:
return kvm_set_msr_common(vcpu, msr);
}
@@ -4504,28 +4512,19 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm *kvm = svm->vcpu.kvm;
struct kvm_lapic *apic = svm->vcpu.arch.apic;
/*
- * At this point, we expect that the AVIC HW has already
- * set the appropriate IRR bits on the valid target
- * vcpus. So, we just need to kick the appropriate vcpu.
+ * Update ICR high and low, then emulate sending IPI,
+ * which is handled when writing APIC_ICR.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & KVM_APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & KVM_APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
- kvm_vcpu_wake_up(vcpu);
- }
+ kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+ kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
}
case AVIC_IPI_FAILURE_INVALID_TARGET:
+ WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
+ index, svm->vcpu.vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -6278,6 +6277,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
int asid, ret;
ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
asid = sev_asid_new();
if (asid < 0)
return ret;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 705f40ae2532..6432d08c7de7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1465,7 +1465,7 @@ TRACE_EVENT(kvm_hv_send_ipi_ex,
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH arch/x86/kvm
+#define TRACE_INCLUDE_PATH ../../arch/x86/kvm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 95bc2247478d..5466c6d85cf3 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -332,16 +332,17 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled;
+
+ vmx->nested.enlightened_vmcs_enabled = true;
if (vmcs_version)
*vmcs_version = nested_get_evmcs_version(vcpu);
/* We don't support disabling the feature for simplicity. */
- if (vmx->nested.enlightened_vmcs_enabled)
+ if (evmcs_already_enabled)
return 0;
- vmx->nested.enlightened_vmcs_enabled = true;
-
vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3170e291215d..d8ea4ebd79e7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -55,7 +55,7 @@ static u16 shadow_read_write_fields[] = {
static int max_shadow_read_write_fields =
ARRAY_SIZE(shadow_read_write_fields);
-void init_vmcs_shadow_fields(void)
+static void init_vmcs_shadow_fields(void)
{
int i, j;
@@ -211,6 +211,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
+ hrtimer_cancel(&vmx->nested.preemption_timer);
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
free_vpid(vmx->nested.vpid02);
@@ -4140,11 +4141,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_vmcs02;
- vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
- vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -4540,9 +4541,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
* given physical address won't match the required
* VMCS12_REVISION identifier.
*/
- nested_vmx_failValid(vcpu,
+ return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
- return kvm_skip_emulated_instruction(vcpu);
}
new_vmcs12 = kmap(page);
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
@@ -5264,13 +5264,17 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
copy_shadow_to_vmcs12(vmx);
}
- if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
+ /*
+ * Copy over the full allocated size of vmcs12 rather than just the size
+ * of the struct.
+ */
+ if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE))
return -EFAULT;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull) {
if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
- get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE))
return -EFAULT;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4d39f731bc33..95d618045001 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -26,6 +26,7 @@
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
@@ -423,7 +424,7 @@ static void check_ept_pointer_match(struct kvm *kvm)
to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
}
-int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
+static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
struct kvm_tlb_range *range = data;
@@ -453,7 +454,7 @@ static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range)
{
struct kvm_vcpu *vcpu;
- int ret = -ENOTSUPP, i;
+ int ret = 0, i;
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
@@ -1773,7 +1774,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
- /* Otherwise falls through */
+ /* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
@@ -2014,7 +2015,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
- /* Otherwise falls through */
+ /* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -2344,7 +2345,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
case 37: /* AAT100 */
case 44: /* BC86,AAY89,BD102 */
case 46: /* BA97 */
- _vmexit_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
@@ -6362,72 +6363,9 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->hv_timer_armed = false;
}
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4, evmcs_rsp;
-
- /* Record the guest's net vcpu time for enforced NMI injections. */
- if (unlikely(!enable_vnmi &&
- vmx->loaded_vmcs->soft_vnmi_blocked))
- vmx->loaded_vmcs->entry_time = ktime_get();
-
- /* Don't enter VMX if guest state is invalid, let the exit handler
- start emulation until we arrive back to a valid state */
- if (vmx->emulation_required)
- return;
-
- if (vmx->ple_window_dirty) {
- vmx->ple_window_dirty = false;
- vmcs_write32(PLE_WINDOW, vmx->ple_window);
- }
-
- if (vmx->nested.need_vmcs12_sync)
- nested_sync_from_vmcs12(vcpu);
-
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
- vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
- vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
- cr4 = cr4_read_shadow();
- if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
- vmcs_writel(HOST_CR4, cr4);
- vmx->loaded_vmcs->host_state.cr4 = cr4;
- }
-
- /* When single-stepping over STI and MOV SS, we must clear the
- * corresponding interruptibility bits in the guest state. Otherwise
- * vmentry fails as it then expects bit 14 (BS) in pending debug
- * exceptions being set, but that's not correct for the guest debugging
- * case. */
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vmx_set_interrupt_shadow(vcpu, 0);
-
- if (static_cpu_has(X86_FEATURE_PKU) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
- vcpu->arch.pkru != vmx->host_pkru)
- __write_pkru(vcpu->arch.pkru);
-
- pt_guest_enter(vmx);
-
- atomic_switch_perf_msrs(vmx);
-
- vmx_update_hv_timer(vcpu);
-
- /*
- * If this vCPU has touched SPEC_CTRL, restore the guest's value if
- * it's non-zero. Since vmentry is serialising on affected CPUs, there
- * is no need to worry about the conditional branch over the wrmsr
- * being speculatively taken.
- */
- x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
+ unsigned long evmcs_rsp;
vmx->__launched = vmx->loaded_vmcs->launched;
@@ -6567,6 +6505,77 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
, "eax", "ebx", "edi"
#endif
);
+}
+STACK_FRAME_NON_STANDARD(__vmx_vcpu_run);
+
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4;
+
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
+ /* Don't enter VMX if guest state is invalid, let the exit handler
+ start emulation until we arrive back to a valid state */
+ if (vmx->emulation_required)
+ return;
+
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+ vmcs_write32(PLE_WINDOW, vmx->ple_window);
+ }
+
+ if (vmx->nested.need_vmcs12_sync)
+ nested_sync_from_vmcs12(vcpu);
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+ }
+
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
+
+ pt_guest_enter(vmx);
+
+ atomic_switch_perf_msrs(vmx);
+
+ vmx_update_hv_timer(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
+
+ __vmx_vcpu_run(vcpu, vmx);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -6648,7 +6657,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
-STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
static struct kvm *vmx_vm_alloc(void)
{
@@ -6816,7 +6824,7 @@ static int vmx_vm_init(struct kvm *kvm)
* Warn upon starting the first VM in a potentially
* insecure environment.
*/
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (sched_smt_active())
pr_warn_once(L1TF_MSG_SMT);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
pr_warn_once(L1TF_MSG_L1D);
@@ -7044,7 +7052,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
/* unmask address range configure area */
for (i = 0; i < vmx->pt_desc.addr_range; i++)
- vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4));
+ vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02c8e095a239..e67ecf25e690 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3834,6 +3834,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_HYPERV_SYNIC2:
if (cap->args[0])
return -EINVAL;
+ /* fall through */
+
case KVM_CAP_HYPERV_SYNIC:
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
@@ -5114,6 +5116,13 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
exception);
}
@@ -6480,8 +6489,7 @@ restart:
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE &&
- (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ if (r == EMULATE_DONE && ctxt->tf)
kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
@@ -7093,10 +7101,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_CLOCK_PAIRING:
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
+#endif
case KVM_HC_SEND_IPI:
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
-#endif
default:
ret = -KVM_ENOSYS;
break;
@@ -7937,6 +7945,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
+ /* fall through */
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index 66894675f3c8..df50451d94ef 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -2,8 +2,11 @@
#include <linux/module.h>
#include <linux/io.h>
+#define movs(type,to,from) \
+ asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory")
+
/* Originally from i386/string.h */
-static __always_inline void __iomem_memcpy(void *to, const void *from, size_t n)
+static __always_inline void rep_movs(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
asm volatile("rep ; movsl\n\t"
@@ -21,13 +24,37 @@ static __always_inline void __iomem_memcpy(void *to, const void *from, size_t n)
void memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
{
- __iomem_memcpy(to, (const void *)from, n);
+ if (unlikely(!n))
+ return;
+
+ /* Align any unaligned source IO */
+ if (unlikely(1 & (unsigned long)from)) {
+ movs("b", to, from);
+ n--;
+ }
+ if (n > 1 && unlikely(2 & (unsigned long)from)) {
+ movs("w", to, from);
+ n-=2;
+ }
+ rep_movs(to, (const void *)from, n);
}
EXPORT_SYMBOL(memcpy_fromio);
void memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
{
- __iomem_memcpy((void *)to, (const void *) from, n);
+ if (unlikely(!n))
+ return;
+
+ /* Align any unaligned destination IO */
+ if (unlikely(1 & (unsigned long)to)) {
+ movs("b", to, from);
+ n--;
+ }
+ if (n > 1 && unlikely(2 & (unsigned long)to)) {
+ movs("w", to, from);
+ n-=2;
+ }
+ rep_movs((void *)to, (const void *) from, n);
}
EXPORT_SYMBOL(memcpy_toio);
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index 79778ab200e4..a53665116458 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -36,8 +36,8 @@ static inline u16 i8254(void)
u16 status, timer;
do {
- outb(I8254_PORT_CONTROL,
- I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+ outb(I8254_CMD_READBACK | I8254_SELECT_COUNTER0,
+ I8254_PORT_CONTROL);
status = inb(I8254_PORT_COUNTER0);
timer = inb(I8254_PORT_COUNTER0);
timer |= inb(I8254_PORT_COUNTER0) << 8;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2ff25ad33233..9d5c75f02295 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -595,7 +595,7 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
return;
}
- addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+ addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
addr |= ((u64)desc.base3 << 32);
#endif
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index a19ef1a416ff..4aa9b1480866 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -158,8 +158,8 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
pmd = pmd_offset(pud, ppd->vaddr);
if (pmd_none(*pmd)) {
pte = ppd->pgtable_area;
- memset(pte, 0, sizeof(pte) * PTRS_PER_PTE);
- ppd->pgtable_area += sizeof(pte) * PTRS_PER_PTE;
+ memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE);
+ ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE;
set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4f8972311a77..14e6119838a6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -230,6 +230,29 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
#endif
+/*
+ * See set_mce_nospec().
+ *
+ * Machine check recovery code needs to change cache mode of poisoned pages to
+ * UC to avoid speculative access logging another error. But passing the
+ * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
+ * speculative access. So we cheat and flip the top bit of the address. This
+ * works fine for the code that updates the page tables. But at the end of the
+ * process we need to flush the TLB and cache and the non-canonical address
+ * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long fix_addr(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+ return (long)(addr << 1) >> 1;
+#else
+ return addr;
+#endif
+}
+
static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
{
if (cpa->flags & CPA_PAGES_ARRAY) {
@@ -313,7 +336,7 @@ void __cpa_flush_tlb(void *data)
unsigned int i;
for (i = 0; i < cpa->numpages; i++)
- __flush_tlb_one_kernel(__cpa_addr(cpa, i));
+ __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
static void cpa_flush(struct cpa_data *data, int cache)
@@ -347,7 +370,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
* Only flush present addresses:
*/
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range_opt((void *)addr, PAGE_SIZE);
+ clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
}
mb();
}
@@ -1627,29 +1650,6 @@ out:
return ret;
}
-/*
- * Machine check recovery code needs to change cache mode of poisoned
- * pages to UC to avoid speculative access logging another error. But
- * passing the address of the 1:1 mapping to set_memory_uc() is a fine
- * way to encourage a speculative access. So we cheat and flip the top
- * bit of the address. This works fine for the code that updates the
- * page tables. But at the end of the process we need to flush the cache
- * and the non-canonical address causes a #GP fault when used by the
- * CLFLUSH instruction.
- *
- * But in the common case we already have a canonical address. This code
- * will fix the top bit if needed and is a no-op otherwise.
- */
-static inline unsigned long make_addr_canonical_again(unsigned long addr)
-{
-#ifdef CONFIG_X86_64
- return (long)(addr << 1) >> 1;
-#else
- return addr;
-#endif
-}
-
-
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 2f6787fc7106..c54a493e139a 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -898,10 +898,7 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
-#ifdef CONFIG_X86_X2APIC
- if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
-#endif
- val &= ~X2APIC_ENABLE;
+ val &= ~X2APIC_ENABLE;
break;
}
return val;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 72bf446c3fee..6e29794573b7 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -361,8 +361,6 @@ void xen_timer_resume(void)
{
int cpu;
- pvclock_resume();
-
if (xen_clockevent != &xen_vcpuop_clockevent)
return;
@@ -379,12 +377,15 @@ static const struct pv_time_ops xen_time_ops __initconst = {
};
static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
+static u64 xen_clock_value_saved;
void xen_save_time_memory_area(void)
{
struct vcpu_register_time_memory_area t;
int ret;
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
+
if (!xen_clock)
return;
@@ -404,7 +405,7 @@ void xen_restore_time_memory_area(void)
int ret;
if (!xen_clock)
- return;
+ goto out;
t.addr.v = &xen_clock->pvti;
@@ -421,6 +422,11 @@ void xen_restore_time_memory_area(void)
if (ret != 0)
pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
ret);
+
+out:
+ /* Need pvclock_resume() before using xen_clocksource_read(). */
+ pvclock_resume();
+ xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}
static void xen_setup_vsyscall_time_info(void)