diff options
Diffstat (limited to 'arch/x86')
26 files changed, 203 insertions, 110 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c77c5d8a7b3e..307529417021 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -178,11 +178,6 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,) endif -ifdef CONFIG_LTO_CLANG -KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \ - -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) -endif - # Workaround for a gcc prelease that unfortunately was shipped in a suse release KBUILD_CFLAGS += -Wno-sign-compare # @@ -202,7 +197,12 @@ ifdef CONFIG_RETPOLINE endif endif -KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) + +ifdef CONFIG_LTO_CLANG +KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \ + -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) +endif ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs --discard-none diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 28a1423ce32e..4bbc267fb36b 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,7 +447,7 @@ 440 i386 process_madvise sys_process_madvise 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 i386 mount_setattr sys_mount_setattr -443 i386 quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 i386 landlock_create_ruleset sys_landlock_create_ruleset 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ecd551b08d05..ce18119ea0d0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,7 +364,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e509325c2c3..8f71dd72ef95 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -396,10 +396,12 @@ int x86_reserve_hardware(void) if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { - if (!reserve_pmc_hardware()) + if (!reserve_pmc_hardware()) { err = -EBUSY; - else + } else { reserve_ds_buffers(); + reserve_lbr_buffers(); + } } if (!err) atomic_inc(&pmc_refcount); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2521d03de5e0..e28892270c58 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6253,7 +6253,7 @@ __init int intel_pmu_init(void) * Check all LBT MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) x86_pmu.lbr_nr = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 76dbab6ac9fb..4409d2cccfda 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -658,7 +658,6 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { - struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu.lbr_nr) @@ -696,11 +695,6 @@ void intel_pmu_lbr_add(struct perf_event *event) perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); - - if (static_cpu_has(X86_FEATURE_ARCH_LBR) && - kmem_cache && !cpuc->lbr_xsave && - (cpuc->lbr_users != cpuc->lbr_pebs_users)) - cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); } void release_lbr_buffers(void) @@ -722,6 +716,26 @@ void release_lbr_buffers(void) } } +void reserve_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + kmem_cache = x86_get_pmu(cpu)->task_ctx_cache; + if (!kmem_cache || cpuc->lbr_xsave) + continue; + + cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL, + cpu_to_node(cpu)); + } +} + void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 27fa85e7d4fd..ad87cb36f7c8 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1244,6 +1244,8 @@ void reserve_ds_buffers(void); void release_lbr_buffers(void); +void reserve_lbr_buffers(void); + extern struct event_constraint bts_constraint; extern struct event_constraint vlbr_constraint; @@ -1393,6 +1395,10 @@ static inline void release_lbr_buffers(void) { } +static inline void reserve_lbr_buffers(void) +{ +} + static inline int intel_pmu_init(void) { return 0; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 323641097f63..e7bef91cee04 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -99,6 +99,7 @@ KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) +KVM_X86_OP_NULL(start_assignment) KVM_X86_OP_NULL(apicv_post_state_restore) KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) KVM_X86_OP_NULL(set_hv_timer) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55efbacfc244..9c7ced0e3171 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1352,6 +1352,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + void (*start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 6ec8b3bfd76e..9f90f460a28c 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -63,6 +63,7 @@ static bool sev_es_negotiate_protocol(void) static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { + ghcb->save.sw_exit_code = 0; memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 9578c82832aa..651b81cd648e 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -203,8 +203,18 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) if (unlikely(data->ghcb_active)) { /* GHCB is already in use - save its contents */ - if (unlikely(data->backup_ghcb_active)) - return NULL; + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + } /* Mark backup_ghcb active before writing to it */ data->backup_ghcb_active = true; @@ -221,24 +231,6 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) return ghcb; } -static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - data->ghcb_active = false; - } -} - /* Needed in vc_early_forward_exception */ void do_early_exception(struct pt_regs *regs, int trapnr); @@ -323,31 +315,44 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; - /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(target, size)) { - memcpy(dst, buf, size); - return ES_OK; - } - + /* + * This function uses __put_user() independent of whether kernel or user + * memory is accessed. This works fine because __put_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __put_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_to_user() here because + * vc_write_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ switch (size) { case 1: memcpy(&d1, buf, 1); - if (put_user(d1, target)) + if (__put_user(d1, target)) goto fault; break; case 2: memcpy(&d2, buf, 2); - if (put_user(d2, target)) + if (__put_user(d2, target)) goto fault; break; case 4: memcpy(&d4, buf, 4); - if (put_user(d4, target)) + if (__put_user(d4, target)) goto fault; break; case 8: memcpy(&d8, buf, 8); - if (put_user(d8, target)) + if (__put_user(d8, target)) goto fault; break; default: @@ -378,30 +383,43 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; - /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(s, size)) { - memcpy(buf, src, size); - return ES_OK; - } - + /* + * This function uses __get_user() independent of whether kernel or user + * memory is accessed. This works fine because __get_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __get_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_from_user() here because + * vc_read_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ switch (size) { case 1: - if (get_user(d1, s)) + if (__get_user(d1, s)) goto fault; memcpy(buf, &d1, 1); break; case 2: - if (get_user(d2, s)) + if (__get_user(d2, s)) goto fault; memcpy(buf, &d2, 2); break; case 4: - if (get_user(d4, s)) + if (__get_user(d4, s)) goto fault; memcpy(buf, &d4, 4); break; case 8: - if (get_user(d8, s)) + if (__get_user(d8, s)) goto fault; memcpy(buf, &d8, 8); break; @@ -461,6 +479,29 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" +static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} + void noinstr __sev_es_nmi_complete(void) { struct ghcb_state state; @@ -1255,6 +1296,10 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) case X86_TRAP_UD: exc_invalid_op(ctxt->regs); break; + case X86_TRAP_PF: + write_cr2(ctxt->fi.cr2); + exc_page_fault(ctxt->regs, error_code); + break; case X86_TRAP_AC: exc_alignment_check(ctxt->regs, error_code); break; @@ -1284,7 +1329,6 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) */ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; @@ -1310,16 +1354,6 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) */ ghcb = sev_es_get_ghcb(&state); - if (!ghcb) { - /* - * Mark GHCBs inactive so that panic() is able to print the - * message. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - } vc_ghcb_invalidate(ghcb); result = vc_init_em_ctxt(&ctxt, regs, error_code); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 0e5d0a7e203b..06743ec054d2 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -127,6 +127,9 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C); + BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10); @@ -138,8 +141,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); - BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a0ccdb56076..5e5de05a8fbf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5111,7 +5111,7 @@ done: return rc; } -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; @@ -5322,7 +5322,8 @@ done_prefixes: ctxt->execute = opcode.u.execute; - if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD))) + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && + likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; if (unlikely(ctxt->d & diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f98370a39936..f00830e5202f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1172,6 +1172,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); u64 gfn; + int idx; if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || @@ -1190,9 +1191,16 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; hv->tsc_ref.tsc_sequence = 0; + + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&kvm->srcu); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + srcu_read_unlock(&kvm->srcu, idx); out_unlock: mutex_unlock(&hv->hv_lock); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index f016838faedd..3e870bf9ca4d 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -314,7 +314,6 @@ struct x86_emulate_ctxt { int interruptibility; bool perm_ok; /* do not check permissions if true */ - bool ud; /* inject an #UD if host doesn't support insn */ bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; @@ -491,7 +490,7 @@ enum x86_intercept { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c0ebef560bd1..8120e8614b92 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1598,11 +1598,19 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; + if (lapic_timer_advance_dynamic) { + adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); + /* + * If the timer fired early, reread the TSC to account for the + * overhead of the above adjustment to avoid waiting longer + * than is necessary. + */ + if (guest_tsc < tsc_deadline) + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + } + if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - - if (lapic_timer_advance_dynamic) - adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) @@ -1661,7 +1669,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(&apic->lapic_timer.pending); - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); if (from_timer_fn) kvm_vcpu_kick(vcpu); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 95eeb5ac6a8a..237317b1eddd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1192,9 +1192,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) } /* - * Remove write access from all the SPTEs mapping GFNs [start, end). If - * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. + * Remove write access from all SPTEs at or above min_level that map GFNs + * [start, end). Returns true if an SPTE has been changed and the TLBs need to + * be flushed. */ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int min_level) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 712b4e0de481..0e62e6a2438c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -28,10 +28,8 @@ #include "svm.h" /* enable / disable AVIC */ -int avic; -#ifdef CONFIG_X86_LOCAL_APIC -module_param(avic, int, S_IRUGO); -#endif +bool avic; +module_param(avic, bool, S_IRUGO); #define SVM_AVIC_DOORBELL 0xc001011b diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 05eca131eaf2..e088086f3de6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1010,9 +1010,7 @@ static __init int svm_hardware_setup(void) } if (avic) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) { avic = false; } else { pr_info("AVIC enabled\n"); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2c9ece618b29..2908c6ab5bb4 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -480,7 +480,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -extern int avic; +extern bool avic; static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8dee8a5fbc17..aa0e7872fcc9 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -90,8 +90,7 @@ static inline bool cpu_has_vmx_preemption_timer(void) static inline bool cpu_has_vmx_posted_intr(void) { - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_load_ia32_efer(void) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 459748680daf..5f81ef092bd4 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -238,6 +238,20 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) /* + * Bail out of the block loop if the VM has an assigned + * device, but the blocking vCPU didn't reconfigure the + * PI.NV to the wakeup vector, i.e. the assigned device + * came along after the initial check in pi_pre_block(). + */ +void vmx_pi_start_assignment(struct kvm *kvm) +{ + if (!irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); +} + +/* * pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 0bdc41391c5b..7f7b2326caf5 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -95,5 +95,6 @@ void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); +void vmx_pi_start_assignment(struct kvm *kvm); #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4bceb5ca3a89..50b42d7a8a11 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4843,7 +4843,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; + unsigned long cr2, dr6; u32 vect_info; vect_info = vmx->idt_vectoring_info; @@ -4933,8 +4933,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; - rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: @@ -7721,6 +7720,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .nested_ops = &vmx_nested_ops, .update_pi_irte = pi_update_irte, + .start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bbc4e04e67ad..b594275d49b5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3105,6 +3105,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + } else { + st->preempted = 0; } vcpu->arch.st.preempted = 0; @@ -7226,6 +7228,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } @@ -7561,14 +7568,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, kvm_vcpu_check_breakpoint(vcpu, &r)) return r; - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; - - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); + r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; @@ -8360,6 +8360,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) vcpu->stat.directed_yield_attempted++; + if (single_task_running()) + goto no_yield; + rcu_read_lock(); map = rcu_dereference(vcpu->kvm->arch.apic_map); @@ -9496,7 +9499,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -10115,8 +10118,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu); /* * Trigger an rflags update that will inject or remove the trace @@ -11499,7 +11501,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { - atomic_inc(&kvm->arch.assigned_device_count); + if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) + static_call_cond(kvm_x86_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 17503fed2017..e87699aa2dc8 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1273,16 +1273,16 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Get mfn list */ xen_build_dynamic_phys_to_machine(); + /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); + x86_configure_nx(); + /* * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed. */ xen_setup_gdt(0); - /* Work out if we support NX */ - get_cpu_cap(&boot_cpu_data); - x86_configure_nx(); - /* Determine virtual and physical address sizes */ get_cpu_address_sizes(&boot_cpu_data); |