author		Thomas Gleixner <tglx@linutronix.de>	2018-08-06 21:56:34 +0300
committer	Thomas Gleixner <tglx@linutronix.de>	2018-08-06 21:56:34 +0300
commit		315706049c343794ad0d3e5b6f6b60b900457b11 (patch)
tree		0a1596a4b5843a6262d17b4c656715854784977d /arch/x86
parent		706d51681d636a0c4a5ef53395ec3b803e45ed4d (diff)
parent		c40a56a7818cfe735fc93a69e1875f8bba834483 (diff)
download	linux-315706049c343794ad0d3e5b6f6b60b900457b11.tar.xz
Merge branch 'x86/pti-urgent' into x86/pti
Integrate the PTI Global bit fixes which conflict with the 32bit PTI
support.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'arch/x86')
27 files changed, 254 insertions, 125 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f1dbb4ee19d7..887d3a7bb646 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,7 +63,7 @@ config X86
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
-	select ARCH_HAS_UACCESS_MCSAFE		if X86_64
+	select ARCH_HAS_UACCESS_MCSAFE		if X86_64 && X86_MCE
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fa42f895fdde..169c2feda14a 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -106,9 +106,13 @@ define cmd_check_data_rel
 	done
 endef
 
+# We need to run two commands under "if_changed", so merge them into a
+# single invocation.
+quiet_cmd_check-and-link-vmlinux = LD      $@
+      cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld)
+
 $(obj)/vmlinux: $(vmlinux-objs-y) FORCE
-	$(call if_changed,check_data_rel)
-	$(call if_changed,ld)
+	$(call if_changed,check-and-link-vmlinux)
 
 OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
 $(obj)/vmlinux.bin: vmlinux FORCE
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 8c5107545251..9e2157371491 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,3 +1,4 @@
+#include <asm/e820/types.h>
 #include <asm/processor.h>
 #include "pgtable.h"
 #include "../string.h"
@@ -34,10 +35,62 @@ unsigned long *trampoline_32bit __section(.data);
 extern struct boot_params *boot_params;
 int cmdline_find_option_bool(const char *option);
 
+static unsigned long find_trampoline_placement(void)
+{
+	unsigned long bios_start, ebda_start;
+	unsigned long trampoline_start;
+	struct boot_e820_entry *entry;
+	int i;
+
+	/*
+	 * Find a suitable spot for the trampoline.
+	 * This code is based on reserve_bios_regions().
+	 */
+	ebda_start = *(unsigned short *)0x40e << 4;
+	bios_start = *(unsigned short *)0x413 << 10;
+
+	if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+		bios_start = BIOS_START_MAX;
+
+	if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
+		bios_start = ebda_start;
+
+	bios_start = round_down(bios_start, PAGE_SIZE);
+
+	/* Find the first usable memory region under bios_start. */
+	for (i = boot_params->e820_entries - 1; i >= 0; i--) {
+		entry = &boot_params->e820_table[i];
+
+		/* Skip all entries above bios_start. */
+		if (bios_start <= entry->addr)
+			continue;
+
+		/* Skip non-RAM entries. */
+		if (entry->type != E820_TYPE_RAM)
+			continue;
+
+		/* Adjust bios_start to the end of the entry if needed. */
+		if (bios_start > entry->addr + entry->size)
+			bios_start = entry->addr + entry->size;
+
+		/* Keep bios_start page-aligned. */
+		bios_start = round_down(bios_start, PAGE_SIZE);
+
+		/* Skip the entry if it's too small. */
+		if (bios_start - TRAMPOLINE_32BIT_SIZE < entry->addr)
+			continue;
+
+		break;
+	}
+
+	/* Place the trampoline just below the end of low memory */
+	return bios_start - TRAMPOLINE_32BIT_SIZE;
+}
+
 struct paging_config paging_prepare(void *rmode)
 {
 	struct paging_config paging_config = {};
-	unsigned long bios_start, ebda_start;
 
 	/* Initialize boot_params. Required for cmdline_find_option_bool(). */
 	boot_params = rmode;
@@ -61,23 +114,7 @@ struct paging_config paging_prepare(void *rmode)
 		paging_config.l5_required = 1;
 	}
 
-	/*
-	 * Find a suitable spot for the trampoline.
-	 * This code is based on reserve_bios_regions().
-	 */
-	ebda_start = *(unsigned short *)0x40e << 4;
-	bios_start = *(unsigned short *)0x413 << 10;
-
-	if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
-		bios_start = BIOS_START_MAX;
-
-	if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
-		bios_start = ebda_start;
-
-	/* Place the trampoline just below the end of low memory, aligned to 4k */
-	paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE;
-	paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE);
+	paging_config.trampoline_start = find_trampoline_placement();
 
 	trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
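
Note: find_trampoline_placement() walks the e820 table from the top and keeps
lowering bios_start until it sits at the end of a RAM entry large enough to
hold the trampoline. A minimal user-space sketch of that scan with a mock
table; the struct, constants and the two-page trampoline size here are
illustrative stand-ins, not the kernel's definitions:

    #include <stdio.h>

    #define PAGE_SZ       0x1000UL
    #define TRAMPOLINE_SZ (2 * PAGE_SZ)   /* assumed size */
    #define TYPE_RAM      1

    struct mock_e820_entry {
            unsigned long addr, size;
            int type;
    };

    static unsigned long page_down(unsigned long x)
    {
            return x & ~(PAGE_SZ - 1);
    }

    /* Mirror of the loop above: scan entries from the top, pull
     * bios_start down to the end of each candidate RAM entry and
     * stop at the first one that can hold the trampoline. */
    static unsigned long place(struct mock_e820_entry *tbl, int n,
                               unsigned long bios_start)
    {
            int i;

            bios_start = page_down(bios_start);
            for (i = n - 1; i >= 0; i--) {
                    if (bios_start <= tbl[i].addr)
                            continue;           /* above the limit */
                    if (tbl[i].type != TYPE_RAM)
                            continue;           /* reserved etc. */
                    if (bios_start > tbl[i].addr + tbl[i].size)
                            bios_start = tbl[i].addr + tbl[i].size;
                    bios_start = page_down(bios_start);
                    if (bios_start - TRAMPOLINE_SZ < tbl[i].addr)
                            continue;           /* entry too small */
                    break;
            }
            return bios_start - TRAMPOLINE_SZ;
    }

    int main(void)
    {
            /* RAM ends at 0x9d000; a reserved hole sits below 0xa0000 */
            struct mock_e820_entry tbl[] = {
                    { 0x0,     0x9d000, TYPE_RAM },
                    { 0x9d000, 0x3000,  2 },
            };

            /* BIOS-reported low-memory end of 0xa0000 -> prints 0x9b000 */
            printf("%#lx\n", place(tbl, 2, 0xa0000));
            return 0;
    }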
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 73a522d53b53..8ae7ffda8f98 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -981,7 +981,7 @@ ENTRY(\sym)
 
 	call	\do_sym
 
-	jmp	error_exit			/* %ebx: no swapgs flag */
+	jmp	error_exit
 	.endif
 END(\sym)
 .endm
@@ -1222,7 +1222,6 @@ END(paranoid_exit)
 
 /*
  * Save all registers in pt_regs, and switch GS if needed.
- * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
ENTRY(error_entry)
	UNWIND_HINT_FUNC
@@ -1269,7 +1268,6 @@ ENTRY(error_entry)
 	 * for these here too.
 	 */
 .Lerror_kernelspace:
-	incl	%ebx
 	leaq	native_irq_return_iret(%rip), %rcx
 	cmpq	%rcx, RIP+8(%rsp)
 	je	.Lerror_bad_iret
@@ -1303,28 +1301,20 @@ ENTRY(error_entry)
 
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
-	 * as if we faulted immediately after IRET and clear EBX so that
-	 * error_exit knows that we will be returning to user mode.
+	 * as if we faulted immediately after IRET.
 	 */
 	mov	%rsp, %rdi
 	call	fixup_bad_iret
 	mov	%rax, %rsp
-	decl	%ebx
 	jmp	.Lerror_entry_from_usermode_after_swapgs
 END(error_entry)
 
-
-/*
- * On entry, EBX is a "return to kernel mode" flag:
- *	1: already in kernel mode, don't need SWAPGS
- *	0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
- */
 ENTRY(error_exit)
 	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	testl	%ebx, %ebx
-	jnz	retint_kernel
+	testb	$3, CS(%rsp)
+	jz	retint_kernel
 	jmp	retint_user
 END(error_exit)
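
Note: the EBX flag can be dropped because the saved CS selector in the
exception frame already records where the CPU came from; "testb $3, CS(%rsp)"
checks the RPL bits of that selector. A simplified C rendering of the
invariant (stand-in struct; in the kernel this check is user_mode() from
asm/ptrace.h):

    #include <stdbool.h>

    struct saved_regs {
            unsigned long cs;   /* CS pushed by the CPU on exception entry */
    };

    static bool came_from_user(const struct saved_regs *regs)
    {
            /* low two bits (RPL) are 3 in user mode, 0 in kernel mode */
            return (regs->cs & 3) == 3;
    }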
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 4b98101209a1..d50bb4dc0650 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -579,7 +579,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 {
 	struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
 	struct perf_event *event = pcpu->event;
-	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event *hwc;
 	struct perf_sample_data data;
 	struct perf_raw_record raw;
 	struct pt_regs regs;
@@ -602,6 +602,10 @@ fail:
 		return 0;
 	}
 
+	if (WARN_ON_ONCE(!event))
+		goto fail;
+
+	hwc = &event->hw;
 	msr = hwc->config_base;
 	buf = ibs_data.regs;
 	rdmsrl(msr, *buf);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 707b2a96e516..86f0c15dcc2d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2997,6 +2997,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
+
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
 	}
 
 	if (needs_branch_stack(event)) {
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8cf03f101938..8dbba77e0518 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1186,16 +1186,20 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	}
 
 	/*
+	 * We must however always use iregs for the unwinder to stay sane; the
+	 * record BP,SP,IP can point into thin air when the record is from a
+	 * previous PMI context or an (I)RET happened between the record and
+	 * PMI.
+	 */
+	if (sample_type & PERF_SAMPLE_CALLCHAIN)
+		data->callchain = perf_callchain(event, iregs);
+
+	/*
 	 * We use the interrupt regs as a base because the PEBS record does not
 	 * contain a full regs set, specifically it seems to lack segment
 	 * descriptors, which get used by things like user_mode().
 	 *
 	 * In the simple case fix up only the IP for PERF_SAMPLE_IP.
-	 *
-	 * We must however always use BP,SP from iregs for the unwinder to stay
-	 * sane; the record BP,SP can point into thin air when the record is
-	 * from a previous PMI context or an (I)RET happened between the record
-	 * and PMI.
 	 */
 	*regs = *iregs;
 
@@ -1214,15 +1218,8 @@ static void setup_pebs_sample_data(struct perf_event *event,
 		regs->si = pebs->si;
 		regs->di = pebs->di;
 
-		/*
-		 * Per the above; only set BP,SP if we don't need callchains.
-		 *
-		 * XXX: does this make sense?
-		 */
-		if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) {
-			regs->bp = pebs->bp;
-			regs->sp = pebs->sp;
-		}
+		regs->bp = pebs->bp;
+		regs->sp = pebs->sp;
 
 #ifndef CONFIG_X86_32
 		regs->r8 = pebs->r8;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index c9e1e0bef3c3..e17ab885b1e9 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -28,7 +28,7 @@
 #define UNCORE_PCI_DEV_TYPE(data)	((data >> 8) & 0xff)
 #define UNCORE_PCI_DEV_IDX(data)	(data & 0xff)
 #define UNCORE_EXTRA_PCI_DEV		0xff
-#define UNCORE_EXTRA_PCI_DEV_MAX	3
+#define UNCORE_EXTRA_PCI_DEV_MAX	4
 
 #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
 
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 87dc0263a2e1..51d7c117e3c7 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1029,6 +1029,7 @@ void snbep_uncore_cpu_init(void)
 enum {
 	SNBEP_PCI_QPI_PORT0_FILTER,
 	SNBEP_PCI_QPI_PORT1_FILTER,
+	BDX_PCI_QPI_PORT2_FILTER,
 	HSWEP_PCI_PCU_3,
 };
 
@@ -3286,15 +3287,18 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
 	},
 	{ /* QPI Port 0 filter */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT0_FILTER),
 	},
 	{ /* QPI Port 1 filter */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT1_FILTER),
 	},
 	{ /* QPI Port 2 filter */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   BDX_PCI_QPI_PORT2_FILTER),
 	},
 	{ /* PCU.3 (for Capability registers) */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
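
Note: for the extra uncore devices, driver_data packs a device type and an
index into one word, and the index selects the slot in the extra-device
array; the change above replaces the magic indices 0/1/2 with named enum
values and bumps UNCORE_EXTRA_PCI_DEV_MAX so the Broadwell-EP port 2 filter
fits. A small stand-alone demonstration of the pack/unpack macros
(UNCORE_PCI_DEV_DATA is reproduced from the same header from memory, so
treat it as approximate; the enum values are illustrative):

    #include <assert.h>

    #define UNCORE_PCI_DEV_DATA(type, idx)  ((type << 8) | idx)
    #define UNCORE_PCI_DEV_TYPE(data)       ((data >> 8) & 0xff)
    #define UNCORE_PCI_DEV_IDX(data)        (data & 0xff)
    #define UNCORE_EXTRA_PCI_DEV            0xff

    enum { PORT0_FILTER, PORT1_FILTER, PORT2_FILTER, PCU_3 };

    int main(void)
    {
            unsigned long data =
                    UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, PORT2_FILTER);

            assert(UNCORE_PCI_DEV_TYPE(data) == UNCORE_EXTRA_PCI_DEV);
            assert(UNCORE_PCI_DEV_IDX(data) == PORT2_FILTER);
            return 0;
    }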
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cfd29ee8c3da..59663c08c949 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -966,6 +966,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
 
 extern unsigned long arch_align_stack(unsigned long sp);
 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+extern void free_kernel_image_pages(void *begin, void *end);
 
 void default_idle(void);
 #ifdef CONFIG_XEN
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 9ef5ee03d2d7..159622ee0674 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -43,7 +43,7 @@ asm    (".pushsection .text;"
 	"push  %rdx;"
 	"mov   $0x1,%eax;"
 	"xor   %edx,%edx;"
-	"lock cmpxchg %dl,(%rdi);"
+	LOCK_PREFIX "cmpxchg %dl,(%rdi);"
 	"cmp   $0x1,%al;"
 	"jne   .slowpath;"
 	"pop   %rdx;"
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index bd090367236c..34cffcef7375 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages);
 int set_memory_4k(unsigned long addr, int numpages);
 int set_memory_encrypted(unsigned long addr, int numpages);
 int set_memory_decrypted(unsigned long addr, int numpages);
+int set_memory_np_noalias(unsigned long addr, int numpages);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wc(unsigned long *addr, int addrinarray);
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 62acb613114b..a9d637bc301d 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -52,7 +52,12 @@ copy_to_user_mcsafe(void *to, const void *from, unsigned len)
 	unsigned long ret;
 
 	__uaccess_begin();
-	ret = memcpy_mcsafe(to, from, len);
+	/*
+	 * Note, __memcpy_mcsafe() is explicitly used since it can
+	 * handle exceptions / faults.  memcpy_mcsafe() may fall back to
+	 * memcpy() which lacks this handling.
+	 */
+	ret = __memcpy_mcsafe(to, from, len);
 	__uaccess_end();
 	return ret;
 }
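
Note: the distinction the new comment draws only makes sense next to the
wrapper it avoids. At the time, memcpy_mcsafe() was a static-key wrapper
along these lines (reconstructed from asm/string_64.h of that era;
approximate, not a verbatim quote):

    static __always_inline __must_check unsigned long
    memcpy_mcsafe(void *dst, const void *src, size_t cnt)
    {
    #ifdef CONFIG_X86_MCE
            /* only take the fault-handling path when MCE recovery is on */
            if (static_branch_unlikely(&mcsafe_key))
                    return __memcpy_mcsafe(dst, src, cnt);
            else
    #endif
                    memcpy(dst, src, cnt);
            return 0;
    }

With the static key off, the fallback memcpy() cannot recover from a machine
check in mid-copy, which is exactly what copy_to_user_mcsafe() must be able
to do; hence the direct call here and the new "if X86_64 && X86_MCE"
dependency in the Kconfig hunk above.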
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2aabd4cb0e3f..adbda5847b14 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -573,6 +573,9 @@ static u32 skx_deadline_rev(void)
 	case 0x04: return 0x02000014;
 	}
 
+	if (boot_cpu_data.x86_stepping > 4)
+		return 0;
+
 	return ~0U;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index c102ad51025e..8c50754c09c1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2165,9 +2165,6 @@ static ssize_t store_int_with_restart(struct device *s,
 	if (check_interval == old_check_interval)
 		return ret;
 
-	if (check_interval < 1)
-		check_interval = 1;
-
 	mutex_lock(&mce_sysfs_mutex);
 	mce_restart();
 	mutex_unlock(&mce_sysfs_mutex);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index bf8d1eb7fca3..3b8e7c13c614 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -138,6 +138,7 @@ static unsigned long kvm_get_tsc_khz(void)
 	src = &hv_clock[cpu].pvti;
 	tsc_khz = pvclock_tsc_khz(src);
 	put_cpu();
+	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
 	return tsc_khz;
 }
 
@@ -319,6 +320,8 @@ void __init kvmclock_init(void)
 	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
 		msr_kvm_system_time, msr_kvm_wall_clock);
 
+	pvclock_set_pvti_cpu0_va(hv_clock);
+
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 
@@ -366,14 +369,11 @@ int __init kvm_setup_vsyscall_timeinfo(void)
 	vcpu_time = &hv_clock[cpu].pvti;
 	flags = pvclock_read_flags(vcpu_time);
 
-	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
-		put_cpu();
-		return 1;
-	}
-
-	pvclock_set_pvti_cpu0_va(hv_clock);
 	put_cpu();
 
+	if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+		return 1;
+
 	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
 #endif
 
 	return 0;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 92fd433c50b9..1bbec387d289 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -85,7 +85,7 @@ config KVM_AMD_SEV
 	def_bool y
 	bool "AMD Secure Encrypted Virtualization (SEV) support"
 	depends on KVM_AMD && X86_64
-	depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP
+	depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
 	---help---
 	  Provides support for launching Encrypted VMs on AMD processors.
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..6b8f11521c41 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -890,7 +890,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		page = (void *)__get_free_page(GFP_KERNEL);
+		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 		if (!page)
 			return -ENOMEM;
 		cache->objects[cache->nobjs++] = page;
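
Note: GFP_KERNEL_ACCOUNT is GFP_KERNEL | __GFP_ACCOUNT, which charges the
pages to the allocating task's memory cgroup. A hypothetical helper showing
the idiom the mmu.c hunk adopts, so shadow-MMU pages allocated on behalf of
a guest count against the VM owner's memcg limits:

    #include <linux/gfp.h>

    /* hypothetical helper, kernel context assumed */
    static void *alloc_guest_mmu_page(void)
    {
            /* accounted to current's memory cgroup */
            return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
    }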
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1689f433f3a0..5d8e317c2b04 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2571,6 +2571,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 #ifdef CONFIG_X86_64
 	int cpu = raw_smp_processor_id();
+	unsigned long fs_base, kernel_gs_base;
 #endif
 	int i;
 
@@ -2586,12 +2587,20 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
 
 #ifdef CONFIG_X86_64
-	save_fsgs_for_kvm();
-	vmx->host_state.fs_sel = current->thread.fsindex;
-	vmx->host_state.gs_sel = current->thread.gsindex;
-#else
-	savesegment(fs, vmx->host_state.fs_sel);
-	savesegment(gs, vmx->host_state.gs_sel);
+	if (likely(is_64bit_mm(current->mm))) {
+		save_fsgs_for_kvm();
+		vmx->host_state.fs_sel = current->thread.fsindex;
+		vmx->host_state.gs_sel = current->thread.gsindex;
+		fs_base = current->thread.fsbase;
+		kernel_gs_base = current->thread.gsbase;
+	} else {
+#endif
+		savesegment(fs, vmx->host_state.fs_sel);
+		savesegment(gs, vmx->host_state.gs_sel);
+#ifdef CONFIG_X86_64
+		fs_base = read_msr(MSR_FS_BASE);
+		kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+	}
 #endif
 	if (!(vmx->host_state.fs_sel & 7)) {
 		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
@@ -2611,10 +2620,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 	savesegment(ds, vmx->host_state.ds_sel);
 	savesegment(es, vmx->host_state.es_sel);
 
-	vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
+	vmcs_writel(HOST_FS_BASE, fs_base);
 	vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
 
-	vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+	vmx->msr_host_kernel_gs_base = kernel_gs_base;
 	if (is_long_mode(&vmx->vcpu))
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #else
@@ -4322,11 +4331,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	vmcs_conf->order = get_order(vmcs_conf->size);
 	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
 
-	/* KVM supports Enlightened VMCS v1 only */
-	if (static_branch_unlikely(&enable_evmcs))
-		vmcs_conf->revision_id = KVM_EVMCS_VERSION;
-	else
-		vmcs_conf->revision_id = vmx_msr_low;
+	vmcs_conf->revision_id = vmx_msr_low;
 
 	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
 	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -4396,7 +4401,13 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 		return NULL;
 	vmcs = page_address(pages);
 	memset(vmcs, 0, vmcs_config.size);
-	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
+
+	/* KVM supports Enlightened VMCS v1 only */
+	if (static_branch_unlikely(&enable_evmcs))
+		vmcs->revision_id = KVM_EVMCS_VERSION;
+	else
+		vmcs->revision_id = vmcs_config.revision_id;
+
 	return vmcs;
 }
 
@@ -4564,6 +4575,19 @@ static __init int alloc_kvm_area(void)
 			return -ENOMEM;
 		}
 
+		/*
+		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
+		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
+		 * revision_id reported by MSR_IA32_VMX_BASIC.
+		 *
+		 * However, even though not explicitly documented by
+		 * TLFS, VMXArea passed as VMXON argument should
+		 * still be marked with revision_id reported by
+		 * physical CPU.
+		 */
+		if (static_branch_unlikely(&enable_evmcs))
+			vmcs->revision_id = vmcs_config.revision_id;
+
 		per_cpu(vmxarea, cpu) = vmcs;
 	}
 	return 0;
@@ -7869,6 +7893,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 		     HRTIMER_MODE_REL_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
+	vmx->nested.vpid02 = allocate_vpid();
+
 	vmx->nested.vmxon = true;
 	return 0;
 
@@ -8456,21 +8482,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 /* Emulate the VMPTRST instruction */
 static int handle_vmptrst(struct kvm_vcpu *vcpu)
 {
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-	gva_t vmcs_gva;
+	unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
 	struct x86_exception e;
+	gva_t gva;
 
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (get_vmx_mem_address(vcpu, exit_qualification,
-			vmx_instruction_info, true, &vmcs_gva))
+	if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
 		return 1;
 	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
-	if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
-					(void *)&to_vmx(vcpu)->nested.current_vmptr,
-					sizeof(u64), &e)) {
+	if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+					sizeof(gpa_t), &e)) {
 		kvm_inject_page_fault(vcpu, &e);
 		return 1;
 	}
@@ -10346,11 +10371,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto free_vmcs;
 	}
 
-	if (nested) {
+	if (nested)
 		nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
 					   kvm_vcpu_apicv_active(&vmx->vcpu));
-		vmx->nested.vpid02 = allocate_vpid();
-	}
 
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
@@ -10367,7 +10390,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	return &vmx->vcpu;
 
 free_vmcs:
-	free_vpid(vmx->nested.vpid02);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 free_msrs:
 	kfree(vmx->guest_msrs);
@@ -11753,7 +11775,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u32 msr_entry_idx;
 	u32 exit_qual;
 	int r;
 
@@ -11775,10 +11796,10 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
 	nested_get_vmcs12_pages(vcpu, vmcs12);
 
 	r = EXIT_REASON_MSR_LOAD_FAIL;
-	msr_entry_idx = nested_vmx_load_msr(vcpu,
-					    vmcs12->vm_entry_msr_load_addr,
-					    vmcs12->vm_entry_msr_load_count);
-	if (msr_entry_idx)
+	exit_qual = nested_vmx_load_msr(vcpu,
+					vmcs12->vm_entry_msr_load_addr,
+					vmcs12->vm_entry_msr_load_count);
+	if (exit_qual)
 		goto fail;
 
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0046aa70205a..2b812b3c5088 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1097,6 +1097,7 @@ static u32 msr_based_features[] = {
 
 	MSR_F10H_DECFG,
 	MSR_IA32_UCODE_REV,
+	MSR_IA32_ARCH_CAPABILITIES,
 };
 
 static unsigned int num_msr_based_features;
@@ -1105,7 +1106,8 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 {
 	switch (msr->index) {
 	case MSR_IA32_UCODE_REV:
-		rdmsrl(msr->index, msr->data);
+	case MSR_IA32_ARCH_CAPABILITIES:
+		rdmsrl_safe(msr->index, &msr->data);
 		break;
 	default:
 		if (kvm_x86_ops->get_msr_feature(msr))
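
Note: MSR_IA32_ARCH_CAPABILITIES is not present on older CPUs, so a plain
rdmsrl() would raise #GP there; rdmsrl_safe() traps the fault and returns an
error instead, which is why the UCODE_REV case is switched over as well. A
sketch of the probing idiom (hypothetical helper, kernel context assumed):

    static u64 read_arch_capabilities(void)
    {
            u64 caps = 0;

            /* returns non-zero, leaving caps at zero, if the MSR faults */
            if (rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &caps))
                    pr_debug("ARCH_CAPABILITIES not enumerated\n");

            return caps;
    }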
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cee58a972cb2..74b157ac078d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -773,13 +773,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	}
 }
 
+/*
+ * begin/end can be in the direct map or the "high kernel mapping"
+ * used for the kernel image only.  free_init_pages() will do the
+ * right thing for either kind of address.
+ */
+void free_kernel_image_pages(void *begin, void *end)
+{
+	unsigned long begin_ul = (unsigned long)begin;
+	unsigned long end_ul = (unsigned long)end;
+	unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
+
+	free_init_pages("unused kernel image", begin_ul, end_ul);
+
+	/*
+	 * PTI maps some of the kernel into userspace.  For performance,
+	 * this includes some kernel areas that do not contain secrets.
+	 * Those areas might be adjacent to the parts of the kernel image
+	 * being freed, which may contain secrets.  Remove the "high kernel
+	 * image mapping" for these freed areas, ensuring they are not even
+	 * potentially vulnerable to Meltdown regardless of the specific
+	 * optimizations PTI is currently using.
+	 *
+	 * The "noalias" prevents unmapping the direct map alias which is
+	 * needed to access the freed pages.
+	 *
+	 * This is only valid for 64bit kernels. 32bit has only one mapping
+	 * which can't be treated in this way for obvious reasons.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
+		set_memory_np_noalias(begin_ul, len_pages);
+}
+
 void __ref free_initmem(void)
 {
 	e820__reallocate_tables();
 
-	free_init_pages("unused kernel",
-			(unsigned long)(&__init_begin),
-			(unsigned long)(&__init_end));
+	free_kernel_image_pages(&__init_begin, &__init_end);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9b19f9a8948e..dd519f372169 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1283,12 +1283,8 @@ void mark_rodata_ro(void)
 	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
 #endif
 
-	free_init_pages("unused kernel",
-			(unsigned long) __va(__pa_symbol(text_end)),
-			(unsigned long) __va(__pa_symbol(rodata_start)));
-	free_init_pages("unused kernel",
-			(unsigned long) __va(__pa_symbol(rodata_end)),
-			(unsigned long) __va(__pa_symbol(_sdata)));
+	free_kernel_image_pages((void *)text_end, (void *)rodata_start);
+	free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
 
 	debug_checkwx();
 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3bded76e8d5c..0a74996a1149 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock);
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
 #define CPA_PAGES_ARRAY 4
+#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 
 	/* No alias checking for _NX bit modifications */
 	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+	/* Has caller explicitly disabled alias checking? */
+	if (in_flag & CPA_NO_CHECK_ALIAS)
+		checkalias = 0;
 
 	ret = __change_page_attr_set_clr(&cpa, checkalias);
 
@@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages)
 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
 }
 
+int set_memory_np_noalias(unsigned long addr, int numpages)
+{
+	int cpa_flags = CPA_NO_CHECK_ALIAS;
+
+	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+					__pgprot(_PAGE_PRESENT), 0,
+					cpa_flags, NULL);
+}
+
 int set_memory_4k(unsigned long addr, int numpages)
 {
 	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
@@ -1784,6 +1797,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages)
 				      __pgprot(_PAGE_GLOBAL), 0);
 }
 
+int set_memory_global(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(&addr, numpages,
+				    __pgprot(_PAGE_GLOBAL), 0);
+}
+
 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 {
 	struct cpa_data cpa;
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bcf35dac1920..ef8db6ffc836 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -470,6 +470,13 @@ static inline bool pti_kernel_image_global_ok(void)
 }
 
 /*
+ * This is the only user for these and it is not arch-generic
+ * like the other set_memory.h functions.  Just extern them.
+ */
+extern int set_memory_nonglobal(unsigned long addr, int numpages);
+extern int set_memory_global(unsigned long addr, int numpages);
+
+/*
  * For some configurations, map all of kernel text into the user page
  * tables.  This reduces TLB misses, especially on non-PCID systems.
  */
@@ -481,7 +488,8 @@ static void pti_clone_kernel_text(void)
 	 * clone the areas past rodata, they might contain secrets.
 	 */
 	unsigned long start = PFN_ALIGN(_text);
-	unsigned long end = (unsigned long)__end_rodata_aligned;
+	unsigned long end_clone = (unsigned long)__end_rodata_aligned;
+	unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
 
 	if (!pti_kernel_image_global_ok())
 		return;
@@ -493,15 +501,19 @@ static void pti_clone_kernel_text(void)
 	 * pti_set_kernel_image_nonglobal() did to clear the
 	 * global bit.
 	 */
-	pti_clone_pmds(start, end, 0);
+	pti_clone_pmds(start, end_clone, _PAGE_RW);
+
+	/*
+	 * pti_clone_pmds() will set the global bit in any PMDs
+	 * that it clones, but we also need to get any PTEs in
+	 * the last level for areas that are not huge-page-aligned.
+	 */
+
+	/* Set the global bit for normal non-__init kernel text: */
+	set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
 }
 
-/*
- * This is the only user for it and it is not arch-generic like
- * the other set_memory.h functions.  Just extern it.
- */
-extern int set_memory_nonglobal(unsigned long addr, int numpages);
-static void pti_set_kernel_image_nonglobal(void)
+void pti_set_kernel_image_nonglobal(void)
 {
 	/*
 	 * The identity map is created with PMDs, regardless of the
@@ -512,9 +524,11 @@ static void pti_set_kernel_image_nonglobal(void)
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
 
-	if (pti_kernel_image_global_ok())
-		return;
-
+	/*
+	 * This clears _PAGE_GLOBAL from the entire kernel image.
+	 * pti_clone_kernel_text() may put _PAGE_GLOBAL back for
+	 * areas that are mapped to userspace.
+	 */
 	set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
 }
 
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 55799873ebe5..8f6cc71e0848 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1441,8 +1441,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth)
 	/* sub esp,STACK_SIZE */
 	EMIT2_off32(0x81, 0xEC, STACK_SIZE);
 
-	/* sub ebp,SCRATCH_SIZE+4+12*/
-	EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+	/* sub ebp,SCRATCH_SIZE+12*/
+	EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 12);
 
 	/* xor ebx,ebx */
 	EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
@@ -1475,8 +1475,8 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth)
 	/* mov edx,dword ptr [ebp+off]*/
 	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
 
-	/* add ebp,SCRATCH_SIZE+4+12*/
-	EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+	/* add ebp,SCRATCH_SIZE+12*/
+	EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 12);
 
 	/* mov ebx,dword ptr [ebp-12]*/
 	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 77873ce700ae..5f2eb3231607 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -417,7 +417,7 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va)
 	if (!(md->attribute & EFI_MEMORY_WB))
 		flags |= _PAGE_PCD;
 
-	if (sev_active())
+	if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO)
 		flags |= _PAGE_ENC;
 
 	pfn = md->phys_addr >> PAGE_SHIFT;
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index 744afdc18cf3..56c44d865f7b 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -16,7 +16,7 @@ static int __init gate_vma_init(void)
 	if (!FIXADDR_USER_START)
 		return 0;
 
-	gate_vma.vm_mm = NULL;
+	vma_init(&gate_vma, NULL);
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
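
Note: the point of the um/mem_32.c conversion is that initializing a static
or on-stack VMA now takes more than clearing vm_mm; vma_init() also installs
a dummy vm_ops so code dereferencing vma->vm_ops does not trip over NULL.
For reference, the helper introduced in the same cycle looks approximately
like this (reconstructed from <linux/mm.h>; treat as approximate):

    static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
    {
            static const struct vm_operations_struct dummy_vm_ops = {};

            vma->vm_mm = mm;
            vma->vm_ops = &dummy_vm_ops;
            INIT_LIST_HEAD(&vma->anon_vma_chain);
    }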