From fc4fad79fc3d8841562e2a85808079da5b4835f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 28 Dec 2021 23:24:36 +0000 Subject: KVM: VMX: Reject KVM_RUN if emulation is required with pending exception Reject KVM_RUN if emulation is required (because VMX is running without unrestricted guest) and an exception is pending, as KVM doesn't support emulating exceptions except when emulating real mode via vm86. The vCPU is hosed either way, but letting KVM_RUN proceed triggers a WARN due to the impossible condition. Alternatively, the WARN could be removed, but then userspace and/or KVM bugs would result in the vCPU silently running in a bad state, which isn't very friendly to users. Originally, the bug was hit by syzkaller with a nested guest as that doesn't require kvm_intel.unrestricted_guest=0. That particular flavor is likely fixed by commit cd0e615c49e5 ("KVM: nVMX: Synthesize TRIPLE_FAULT for L2 if emulation is required"), but it's trivial to trigger the WARN with a non-nested guest, and userspace can likely force bad state via ioctls() for a nested guest as well. Checking for the impossible condition needs to be deferred until KVM_RUN because KVM can't force specific ordering between ioctls. E.g. clearing exception.pending in KVM_SET_SREGS doesn't prevent userspace from setting it in KVM_SET_VCPU_EVENTS, and disallowing KVM_SET_VCPU_EVENTS with emulation_required would prevent userspace from queuing an exception and then stuffing sregs. Note, if KVM were to try and detect/prevent the condition prior to KVM_RUN, handle_invalid_guest_state() and/or handle_emulation_failure() would need to be modified to clear the pending exception prior to exiting to userspace. ------------[ cut here ]------------ WARNING: CPU: 6 PID: 137812 at arch/x86/kvm/vmx/vmx.c:1623 vmx_queue_exception+0x14f/0x160 [kvm_intel] CPU: 6 PID: 137812 Comm: vmx_invalid_nes Not tainted 5.15.2-7cc36c3e14ae-pop #279 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:vmx_queue_exception+0x14f/0x160 [kvm_intel] Code: <0f> 0b e9 fd fe ff ff 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 RSP: 0018:ffffa45c83577d38 EFLAGS: 00010202 RAX: 0000000000000003 RBX: 0000000080000006 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000010002 RDI: ffff9916af734000 RBP: ffff9916af734000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000006 R13: 0000000000000000 R14: ffff9916af734038 R15: 0000000000000000 FS: 00007f1e1a47c740(0000) GS:ffff99188fb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1e1a6a8008 CR3: 000000026f83b005 CR4: 00000000001726e0 Call Trace: kvm_arch_vcpu_ioctl_run+0x13a2/0x1f20 [kvm] kvm_vcpu_ioctl+0x279/0x690 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+82112403ace4cbd780d8@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20211228232437.1875318-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 15e30602782b..41aaa37d9eb8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending; +} + static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) { + if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7708,6 +7725,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, + .vcpu_pre_run = vmx_vcpu_pre_run, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, -- cgit v1.2.3 From d76fb40637fc0e84b27bf431cd72cf8fe3f813ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:14 +0000 Subject: KVM: VMX: Handle PI descriptor updates during vcpu_put/load Move the posted interrupt pre/post_block logic into vcpu_put/load respectively, using the kvm_vcpu_is_blocking() to determining whether or not the wakeup handler needs to be set (and unset). This avoids updating the PI descriptor if halt-polling is successful, reduces the number of touchpoints for updating the descriptor, and eliminates the confusing behavior of intentionally leaving a "stale" PI.NDST when a blocking vCPU is scheduled back in after preemption. The downside is that KVM will do the PID update twice if the vCPU is preempted after prepare_to_rcuwait() but before schedule(), but that's a rare case (and non-existent on !PREEMPT kernels). The notable wart is the need to send a self-IPI on the wakeup vector if an outstanding notification is pending after configuring the wakeup vector. Ideally, KVM would just do a kvm_vcpu_wake_up() in this case, but the scheduler doesn't support waking a task from its preemption notifier callback, i.e. while the task is right in the middle of being scheduled out. Note, setting the wakeup vector before halt-polling is not necessary: once the pending IRQ will be recorded in the PIR, kvm_vcpu_has_events() will detect this (via kvm_cpu_get_interrupt(), kvm_apic_get_interrupt(), apic_has_interrupt_for_ppr() and finally vmx_sync_pir_to_irr()) and terminate the polling. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 150 ++++++++++++++++++----------------------- arch/x86/kvm/vmx/posted_intr.h | 8 ++- arch/x86/kvm/vmx/vmx.c | 5 -- 3 files changed, 70 insertions(+), 93 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index a82a15aa1982..63e2399f353a 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -52,6 +52,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; + unsigned long flags; unsigned int dest; /* @@ -62,23 +63,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!enable_apicv || !lapic_in_kernel(vcpu)) return; - /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + /* + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the + * full update can be skipped as neither the vector nor the destination + * needs to be changed. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { + /* + * Clear SN if it was set due to being preempted. Again, do + * this even if there is no assigned device for simplicity. + */ + if (pi_test_and_clear_sn(pi_desc)) + goto after_clear_sn; return; + } + + local_irq_save(flags); /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup + * list of the _previous_ pCPU, which will not be the same as the + * current pCPU if the task was migrated. */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + list_del(&vcpu->blocked_vcpu_list); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); } - /* The full case. Set the new destination and clear SN. */ dest = cpu_physical_id(cpu); if (!x2apic_mode) dest = (dest << 8) & 0xFF00; @@ -86,10 +98,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) do { old.control = new.control = READ_ONCE(pi_desc->control); + /* + * Clear SN (as above) and refresh the destination APIC ID to + * handle task migration (@cpu != vcpu->cpu). + */ new.ndst = dest; new.sn = 0; + + /* + * Restore the notification vector; in the blocking case, the + * descriptor was modified on "put" to use the wakeup vector. + */ + new.nv = POSTED_INTR_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); + local_irq_restore(flags); + after_clear_sn: /* @@ -111,83 +135,24 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) irq_remapping_cap(IRQ_POSTING_CAP); } -void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!vmx_can_use_vtd_pi(vcpu->kvm)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * Remove the vCPU from the wakeup list of the _previous_ pCPU, which - * will not be the same as the current pCPU if the task was migrated. - */ - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - - dest = cpu_physical_id(vcpu->cpu); - if (!x2apic_mode) - dest = (dest << 8) & 0xFF00; - - WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the vCPU was blocking"); - - do { - old.control = new.control = READ_ONCE(pi_desc->control); - - new.ndst = dest; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); - - vcpu->pre_pcpu = -1; -} - /* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set + * WAKEUP as the notification vector in the PI descriptor. */ -int pi_pre_block(struct kvm_vcpu *vcpu) +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { - struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; unsigned long flags; - if (!vmx_can_use_vtd_pi(vcpu->kvm) || - vmx_interrupt_blocked(vcpu)) - return 0; - local_irq_save(flags); - vcpu->pre_pcpu = vcpu->cpu; raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); list_add_tail(&vcpu->blocked_vcpu_list, &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn == 1, - "Posted Interrupt Suppress Notification set before blocking"); + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); do { old.control = new.control = READ_ONCE(pi_desc->control); @@ -196,24 +161,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu) new.nv = POSTED_INTR_WAKEUP_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc)) - __pi_post_block(vcpu); + /* + * Send a wakeup IPI to this CPU if an interrupt may have been posted + * before the notification vector was updated, in which case the IRQ + * will arrive on the non-wakeup vector. An IPI is needed as calling + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not + * enabled until it is safe to call try_to_wake_up() on the task being + * scheduled out). + */ + if (pi_test_on(&new)) + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); local_irq_restore(flags); - return (vcpu->pre_pcpu == -1); } -void pi_post_block(struct kvm_vcpu *vcpu) +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { - unsigned long flags; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (vcpu->pre_pcpu == -1) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; - local_irq_save(flags); - __pi_post_block(vcpu); - local_irq_restore(flags); + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + pi_enable_wakeup_handler(vcpu); + + /* + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen + * as blocking and preempted, e.g. if it's preempted between setting + * its wait state and manually scheduling out. + */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } /* @@ -254,7 +232,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) * Bail out of the block loop if the VM has an assigned * device, but the blocking vCPU didn't reconfigure the * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in pi_pre_block(). + * came along after the initial check in vmx_vcpu_pi_put(). */ void vmx_pi_start_assignment(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 36ae035f14aa..eb14e76b84ef 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); @@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 41aaa37d9eb8..3a9a49a87b9d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7566,9 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) static int vmx_pre_block(struct kvm_vcpu *vcpu) { - if (pi_pre_block(vcpu)) - return 1; - if (kvm_lapic_hv_timer_in_use(vcpu)) kvm_lapic_switch_to_sw_timer(vcpu); @@ -7579,8 +7576,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) { if (kvm_x86_ops.set_hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); } static void vmx_setup_mce(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 12a8eee5686ef3ea7d8db90cd664f11e4a39e349 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:16 +0000 Subject: KVM: Move x86 VMX's posted interrupt list_head to vcpu_vmx Move the seemingly generic block_vcpu_list from kvm_vcpu to vcpu_vmx, and rename the list and all associated variables to clarify that it tracks the set of vCPU that need to be poked on a posted interrupt to the wakeup vector. The list is not used to track _all_ vCPUs that are blocking, and the term "blocked" can be misleading as it may refer to a blocking condition in the host or the guest, where as the PI wakeup case is specifically for the vCPUs that are actively blocking from within the guest. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 39 ++++++++++++++++++++------------------- arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/vmx/vmx.h | 3 +++ include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 2 -- 5 files changed, 25 insertions(+), 23 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 63e2399f353a..901eea44cf24 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -19,7 +19,7 @@ * wake the target vCPUs. vCPUs are removed from the list and the notification * vector is reset when the vCPU is scheduled in. */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); /* * Protect the per-CPU list with a per-CPU spinlock to handle task migration. * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the @@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * occur if a wakeup IRQ arrives and attempts to acquire the lock. */ -static DEFINE_PER_CPU(raw_spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { @@ -51,6 +51,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -86,9 +87,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_del(&vcpu->blocked_vcpu_list); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_del(&vmx->pi_wakeup_list); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); } dest = cpu_physical_id(cpu); @@ -142,15 +143,16 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; unsigned long flags; local_irq_save(flags); - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vmx->pi_wakeup_list, + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); @@ -199,24 +201,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) */ void pi_wakeup_handler(void) { - struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); + struct vcpu_vmx *vmx; - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); + list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), + pi_wakeup_list) { - if (pi_test_on(pi_desc)) - kvm_vcpu_kick(vcpu); + if (pi_test_on(&vmx->pi_desc)) + kvm_vcpu_kick(&vmx->vcpu); } - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - raw_spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3a9a49a87b9d..1840898bb184 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6943,6 +6943,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + err = -ENOMEM; vmx->vpid = allocate_vpid(); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f8fc7441baea..7f2c82e7f38f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -317,6 +317,9 @@ struct vcpu_vmx { /* Posted interrupt descriptor */ struct pi_desc pi_desc; + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3c67b6318f..f079820f52b5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,8 +309,6 @@ struct kvm_vcpu { u64 requests; unsigned long guest_debug; - struct list_head blocked_vcpu_list; - struct mutex mutex; struct kvm_run *run; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c301cd8d583e..5a1164483e6c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -427,8 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) #endif kvm_async_pf_vcpu_init(vcpu); - INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); - kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; -- cgit v1.2.3 From 98c25ead5eda5e9d41abe57839ad3e8caf19500c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:17 +0000 Subject: KVM: VMX: Move preemption timer <=> hrtimer dance to common x86 Handle the switch to/from the hypervisor/software timer when a vCPU is blocking in common x86 instead of in VMX. Even though VMX is the only user of a hypervisor timer, the logic and all functions involved are generic x86 (unless future CPUs do something completely different and implement a hypervisor timer that runs regardless of mode). Handling the switch in common x86 will allow for the elimination of the pre/post_blocks hooks, and also lets KVM switch back to the hypervisor timer if and only if it was in use (without additional params). Add a comment explaining why the switch cannot be deferred to kvm_sched_out() or kvm_vcpu_block(). Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +----- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1840898bb184..70be166833ac 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7568,16 +7568,12 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) static int vmx_pre_block(struct kvm_vcpu *vcpu) { - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - return 0; } static void vmx_post_block(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops.set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); + } static void vmx_setup_mce(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49ff85e966d7..943706a09e6e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10146,8 +10146,21 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { + bool hv_timer; + if (!kvm_arch_vcpu_runnable(vcpu) && (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. + */ + hv_timer = kvm_lapic_hv_timer_in_use(vcpu); + if (hv_timer) + kvm_lapic_switch_to_sw_timer(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) kvm_vcpu_halt(vcpu); @@ -10155,6 +10168,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + if (kvm_x86_ops.post_block) static_call(kvm_x86_post_block)(vcpu); @@ -10349,6 +10365,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } + /* + * It should be impossible for the hypervisor timer to be in + * use before KVM has ever run the vCPU. + */ + WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); kvm_vcpu_block(vcpu); if (kvm_apic_accept_events(vcpu) < 0) { r = 0; -- cgit v1.2.3 From c3e8abf0f3536a46a235b0533149c2b2c2bbac27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:19 +0000 Subject: KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks Drop kvm_x86_ops' pre/post_block() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 -- arch/x86/include/asm/kvm_host.h | 12 ------------ arch/x86/kvm/vmx/vmx.c | 13 ------------- arch/x86/kvm/x86.c | 6 +----- 4 files changed, 1 insertion(+), 32 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 65c3f1857d6e..631d5040b31e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -99,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff) KVM_X86_OP_NULL(request_immediate_exit) KVM_X86_OP(sched_in) KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(pre_block) -KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6a8fa26ef98c..682ad02a4e58 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1454,18 +1454,6 @@ struct kvm_x86_ops { const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; - /* - * Architecture specific hooks for vCPU blocking due to - * HLT instruction. - * Returns for .pre_block(): - * - 0 means continue to block the vCPU. - * - 1 means we cannot block the vCPU since some event - * happens during this period, such as, 'ON' bit in - * posted-interrupts descriptor is set. - */ - int (*pre_block)(struct kvm_vcpu *vcpu); - void (*post_block)(struct kvm_vcpu *vcpu); - void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 70be166833ac..9ab98e1bd65a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7566,16 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7777,9 +7767,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 943706a09e6e..4b9728a53de6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10148,8 +10148,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { bool hv_timer; - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + if (!kvm_arch_vcpu_runnable(vcpu)) { /* * Switch to the software timer before halt-polling/blocking as * the guest's timer may be a break event for the vCPU, and the @@ -10171,9 +10170,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - if (kvm_x86_ops.post_block) - static_call(kvm_x86_post_block)(vcpu); - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } -- cgit v1.2.3 From 0f65a9d337676b966316db17374fbef910ab8e4a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:26 +0000 Subject: KVM: VMX: Don't do full kick when triggering posted interrupt "fails" Replace the full "kick" with just the "wake" in the fallback path when triggering a virtual interrupt via a posted interrupt fails because the guest is not IN_GUEST_MODE. If the guest transitions into guest mode between the check and the kick, then it's guaranteed to see the pending interrupt as KVM syncs the PIR to IRR (and onto GUEST_RVI) after setting IN_GUEST_MODE. Kicking the guest in this case is nothing more than an unnecessary VM-Exit (and host IRQ). Opportunistically update comments to explain the various ordering rules and barriers at play. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-17-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9ab98e1bd65a..ac3e943ec2a4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3998,7 +3998,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, /* the PIR and ON have been set by L1. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_wake_up(vcpu); return 0; } return -1; @@ -4036,7 +4036,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_wake_up(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b9728a53de6..55518b7d3b96 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9978,10 +9978,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_mb__after_srcu_read_unlock(); /* - * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. Assigned devices can - * use the POSTED_INTR_VECTOR even if APICv is disabled, - * so do it even if APICv is disabled on this vCPU. + * Process pending posted interrupts to handle the case where the + * notification IRQ arrived in the host, or was never sent (because the + * target vCPU wasn't running). Do this regardless of the vCPU's APICv + * status, KVM doesn't update assigned devices when APICv is inhibited, + * i.e. they can post interrupts even if APICv is temporarily disabled. */ if (kvm_lapic_enabled(vcpu)) static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); -- cgit v1.2.3 From 296aa26644d088d8ccf0d62b0a93443f7188d5e5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:28 +0000 Subject: KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ Refactor the posted interrupt helper to take the desired notification vector instead of a bool so that the callers are self-documenting. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ac3e943ec2a4..063c21559aa1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3932,11 +3932,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) + int pi_vec) { #ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - if (vcpu->mode == IN_GUEST_MODE) { /* * The vector of interrupt to be delivered to vcpu had @@ -3997,7 +3995,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR)) kvm_vcpu_wake_up(vcpu); return 0; } @@ -4035,7 +4033,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR)) kvm_vcpu_wake_up(vcpu); return 0; -- cgit v1.2.3 From ccf8d687542f6a7288b79727bec1cc084b3771b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:29 +0000 Subject: KVM: VMX: Fold fallback path into triggering posted IRQ helper Move the fallback "wake_up" path into the helper to trigger posted interrupt helper now that the nested and non-nested paths are identical. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/vmx/vmx.c') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 063c21559aa1..a02a28ce7cc3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3931,7 +3931,7 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, int pi_vec) { #ifdef CONFIG_SMP @@ -3962,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, */ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; + return; } #endif - return false; + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -3995,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR)) - kvm_vcpu_wake_up(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; @@ -4033,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR)) - kvm_vcpu_wake_up(vcpu); - + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); return 0; } -- cgit v1.2.3