diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-16 01:43:55 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-16 01:43:55 +0300 |
commit | 9db59599ae502b38b27cff6462273f84acd59927 (patch) | |
tree | 96d90a2f7bcddc837987579ad2d3e58b891db716 | |
parent | b38923a068c10fc36ca8f596d650d095ce390b85 (diff) | |
parent | 4f350c6dbcb9000e18907515ec8a7b205ac33c69 (diff) | |
download | linux-9db59599ae502b38b27cff6462273f84acd59927.tar.xz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more KVM updates from Paolo Bonzini:
- PPC bugfixes
- RCU splat fix
- swait races fix
- pointless userspace-triggerable BUG() fix
- misc fixes for KVM_RUN corner cases
- nested virt correctness fixes + one host DoS
- some cleanups
- clang build fix
- fix AMD AVIC with default QEMU command line options
- x86 bugfixes
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly
kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly
kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry
kvm,mips: Fix potential swait_active() races
kvm,powerpc: Serialize wq active checks in ops->vcpu_kick
kvm: Serialize wq active checks in kvm_vcpu_wake_up()
kvm,x86: Fix apf_task_wake_one() wq serialization
kvm,lapic: Justify use of swait_active()
kvm,async_pf: Use swq_has_sleeper()
sched/wait: Add swq_has_sleeper()
KVM: VMX: Do not BUG() on out-of-bounds guest IRQ
KVM: Don't accept obviously wrong gsi values via KVM_IRQFD
kvm: nVMX: Don't allow L2 to access the hardware CR8
KVM: trace events: update list of exit reasons
KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously
KVM: X86: Don't block vCPU if there is pending exception
KVM: SVM: Add irqchip_split() checks before enabling AVIC
KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv()
KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu()
KVM: x86: fix clang build
...
-rw-r--r-- | arch/mips/kvm/mips.c | 4 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv.c | 4 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rm_xive.c | 1 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rmhandlers.S | 17 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_xive.c | 1 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_xive_template.c | 7 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 3 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 38 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 162 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 51 | ||||
-rw-r--r-- | include/linux/swait.h | 58 | ||||
-rw-r--r-- | include/trace/events/kvm.h | 4 | ||||
-rw-r--r-- | virt/kvm/async_pf.c | 6 | ||||
-rw-r--r-- | virt/kvm/eventfd.c | 2 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 3 |
18 files changed, 257 insertions, 111 deletions
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index bce2a6431430..d535edc01434 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -514,7 +514,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; - if (swait_active(&dvcpu->wq)) + if (swq_has_sleeper(&dvcpu->wq)) swake_up(&dvcpu->wq); return 0; @@ -1179,7 +1179,7 @@ static void kvm_mips_comparecount_func(unsigned long data) kvm_mips_callbacks->queue_timer_int(vcpu); vcpu->arch.wait = 0; - if (swait_active(&vcpu->wq)) + if (swq_has_sleeper(&vcpu->wq)) swake_up(&vcpu->wq); } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 18e974a34fce..73bf1ebfa78f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -181,7 +181,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (swait_active(wqp)) { + if (swq_has_sleeper(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; } @@ -4212,11 +4212,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL; + mutex_lock(&kvm->lock); kvm->arch.process_table = cfg->process_table; kvmppc_setup_partition_table(kvm); lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + mutex_unlock(&kvm->lock); return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c index abf5f01b6eb1..5b81a807d742 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xive.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c @@ -38,7 +38,6 @@ static inline void __iomem *get_tima_phys(void) #define __x_tima get_tima_phys() #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) -#define __x_readb __raw_rm_readb #define __x_writeb __raw_rm_writeb #define __x_readw __raw_rm_readw #define __x_readq __raw_rm_readq diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 663a4a861e7f..17936f82d3c7 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) /* * Are we running hash or radix ? */ - beq cr2,3f + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi cr2, r0, 0 + beq cr2, 3f /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) @@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ ld r9, HSTATE_KVM_VCPU(r13) bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) @@ -2578,6 +2590,9 @@ kvm_end_cede: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 08b200a0bbce..13304622ab1c 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -48,7 +48,6 @@ #define __x_tima xive_tima #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) -#define __x_readb __raw_readb #define __x_writeb __raw_writeb #define __x_readw __raw_readw #define __x_readq __raw_readq diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index d1ed2c41b5d2..c7a5deadd1cc 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -28,7 +28,8 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) * bit. */ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr >= xc->hw_cppr) return; } @@ -336,7 +337,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; u8 pending = xc->pending; u32 hirq; - u8 pipr; pr_devel("H_IPOLL(server=%ld)\n", server); @@ -353,7 +353,8 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long pending = 0xff; } else { /* Grab pending interrupt if any */ - pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr < 8) pending |= 1 << pipr; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8844eee290b2..c73e493adf07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -951,7 +951,6 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -973,7 +972,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(void); + bool (*get_enable_apicv)(struct kvm_vcpu *vcpu); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 874827b0d7ca..aa60a08b65b1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (swait_active(&n->wq)) + else if (swq_has_sleeper(&n->wq)) swake_up(&n->wq); } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 1ea3c0e1e3a9..0bc5c1315708 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; - BUILD_BUG_ON(!__builtin_constant_p(x86_leaf)); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index aaf10b6f5380..69c5612be786 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); + /* + * For x86, the atomic_inc() is serialized, thus + * using swait_active() is safe. + */ if (swait_active(q)) swake_up(q); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2c1cfe68a9af..0e68f0b3cbf7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - svm->vcpu.arch.apicv_active = true; } static void init_vmcb(struct vcpu_svm *svm) @@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } - if (avic) + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); /* @@ -1600,6 +1599,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } +static int avic_init_vcpu(struct vcpu_svm *svm) +{ + int ret; + + if (!kvm_vcpu_apicv_active(&svm->vcpu)) + return 0; + + ret = avic_init_backing_page(&svm->vcpu); + if (ret) + return ret; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + + return ret; +} + static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; @@ -1636,14 +1652,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; - if (avic) { - err = avic_init_backing_page(&svm->vcpu); - if (err) - goto free_page4; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); - } + err = avic_init_vcpu(svm); + if (err) + goto free_page4; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -4395,9 +4406,9 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static bool svm_get_enable_apicv(void) +static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { - return avic; + return avic && irqchip_split(vcpu->kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -4414,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return; vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; @@ -5302,6 +5313,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, */ if (info->rep_prefix != REPE_PREFIX) goto out; + break; case SVM_EXIT_IOIO: { u64 exit_info; u32 bytes; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 699704d4bc9e..06c0c6d0541e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ } } -static bool vmx_get_enable_apicv(void) +static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; } @@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + if (vmx->nested.nested_run_pending) + return false; + + if (unlikely(vmx->fail)) { + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); + return true; + } /* * The host physical addresses of some pages of guest memory @@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) */ nested_mark_vmcs12_pages_dirty(vcpu); - if (vmx->nested.nested_run_pending) - return false; - - if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; - } + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: @@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - /* * eager fpu is enabled if PKEY is supported and CR4 is switched * back on host, so it is safe to read guest PKRU from current @@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vmx->nested.nested_run_pending = 0; + vmx->idt_vectoring_info = 0; + + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return; + + vmx->loaded_vmcs->launched = 1; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); @@ -10525,6 +10527,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & CPU_BASED_TPR_SHADOW) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } /* @@ -11388,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. + */ + WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD)); + leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (likely(!vmx->fail)) { + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); - if (unlikely(vmx->fail)) - vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + } vmx_switch_vmcs(vcpu, &vmx->vmcs01); - - /* - * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of - * the VM-exit interrupt information (valid interrupt) is always set to - * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need - * kvm_cpu_has_interrupt(). See the commit message for details. - */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); @@ -11436,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (VMCS02_POOL_SIZE == 0) nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - load_vmcs12_host_state(vcpu, vmcs12); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); @@ -11486,21 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - /* - * Exiting from L2 to L1, we're now back to L1 which thinks it just - * finished a VMLAUNCH or VMRESUME instruction, so we need to set the - * success or failure flag accordingly. - */ - if (unlikely(vmx->fail)) { - vmx->fail = 0; - nested_vmx_failValid(vcpu, vm_inst_error); - } else - nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + /* + * TODO: SDM says that with acknowledge interrupt on + * exit, bit 31 of the VM-exit interrupt information + * (valid interrupt) is always set to 1 on + * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't + * need kvm_cpu_has_interrupt(). See the commit + * message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + /* + * The emulated instruction was already skipped in + * nested_vmx_run, but the updated RIP was never + * written back to the vmcs01. + */ + skip_emulated_instruction(vcpu); + vmx->fail = 0; } /* @@ -11829,7 +11854,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11838,7 +11863,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6069af86da3b..cd17b7d9a107 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7231,10 +7231,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + if (kvm_run->immediate_exit) { + r = -EINTR; + goto out; + } kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } goto out; } @@ -7971,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) @@ -8452,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; + if (vcpu->arch.exception.pending) + return true; + if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu))) @@ -8619,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) sizeof(val)); } +static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, + sizeof(u32)); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -8646,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; + u32 val; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -8653,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->gva); - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && + !apf_get_user(vcpu, &val)) { + if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && + vcpu->arch.exception.pending && + vcpu->arch.exception.nr == PF_VECTOR && + !apf_put_user(vcpu, 0)) { + vcpu->arch.exception.injected = false; + vcpu->arch.exception.pending = false; + vcpu->arch.exception.nr = 0; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.error_code = 0; + } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + fault.async_page_fault = true; + kvm_inject_page_fault(vcpu, &fault); + } } vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/include/linux/swait.h b/include/linux/swait.h index 4a4e180d0a35..73e97a08d3d0 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name DECLARE_SWAIT_QUEUE_HEAD(name) #endif -static inline int swait_active(struct swait_queue_head *q) +/** + * swait_active -- locklessly test for waiters on the queue + * @wq: the waitqueue to test for waiters + * + * returns true if the wait list is not empty + * + * NOTE: this function is lockless and requires care, incorrect usage _will_ + * lead to sporadic and non-obvious failure. + * + * NOTE2: this function has the same above implications as regular waitqueues. + * + * Use either while holding swait_queue_head::lock or when used for wakeups + * with an extra smp_mb() like: + * + * CPU0 - waker CPU1 - waiter + * + * for (;;) { + * @cond = true; prepare_to_swait(&wq_head, &wait, state); + * smp_mb(); // smp_mb() from set_current_state() + * if (swait_active(wq_head)) if (@cond) + * wake_up(wq_head); break; + * schedule(); + * } + * finish_swait(&wq_head, &wait); + * + * Because without the explicit smp_mb() it's possible for the + * swait_active() load to get hoisted over the @cond store such that we'll + * observe an empty wait list while the waiter might not observe @cond. + * This, in turn, can trigger missing wakeups. + * + * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), + * which (when the lock is uncontended) are of roughly equal cost. + */ +static inline int swait_active(struct swait_queue_head *wq) +{ + return !list_empty(&wq->task_list); +} + +/** + * swq_has_sleeper - check if there are any waiting processes + * @wq: the waitqueue to test for waiters + * + * Returns true if @wq has waiting processes + * + * Please refer to the comment for swait_active. + */ +static inline bool swq_has_sleeper(struct swait_queue_head *wq) { - return !list_empty(&q->task_list); + /* + * We need to be sure we are in sync with the list_add() + * modifications to the wait queue (task_list). + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return swait_active(wq); } extern void swake_up(struct swait_queue_head *q); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 8ade3eb6c640..dcffedfac431 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -14,7 +14,9 @@ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ - ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH) + ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ + ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ + ERSN(HYPERV) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index bb298a200cd3..57bcb27dcf30 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -106,11 +106,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); - /* - * This memory barrier pairs with prepare_to_wait's set_current_state() - */ - smp_mb(); - if (swait_active(&vcpu->wq)) + if (swq_has_sleeper(&vcpu->wq)) swake_up(&vcpu->wq); mmput(mm); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f2ac53ab8243..c608ab495282 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -565,6 +565,8 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) return -EINVAL; + if (args->gsi >= KVM_MAX_IRQ_ROUTES) + return -EINVAL; if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) return kvm_irqfd_deassign(kvm, args); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6ed1c2021198..9deb5a245b83 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -674,6 +674,7 @@ out_err_no_irq_srcu: out_err_no_srcu: hardware_disable_all(); out_err_no_disable: + refcount_set(&kvm->users_count, 0); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) @@ -2186,7 +2187,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (swait_active(wqp)) { + if (swq_has_sleeper(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; return true; |