// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>

#include <asm/irq_remapping.h>
#include <asm/cpu.h>

#include "lapic.h"
#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
#include "tdx.h"

/*
 * Maintain a per-CPU list of vCPUs that need to be awakened by pi_wakeup_handler()
 * when a WAKEUP_VECTOR interrupt is posted.  vCPUs are added to the list when
 * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
 * The vCPU's posted interrupt descriptor is updated at the same time to set its
 * notification vector to WAKEUP_VECTOR, so that a posted interrupt from a
 * device wakes the target vCPU.  vCPUs are removed from the list and the
 * notification vector is reset when the vCPU is scheduled in.
 */
static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
/*
 * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
 * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
 * ->sched_in() path will need to take the vCPU off the list of the _previous_
 * CPU.  IRQs must be disabled when taking this lock, otherwise deadlock will
 * occur if a wakeup IRQ arrives and attempts to acquire the lock.
 */
static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);

#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING

static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
	return &(to_vt(vcpu)->pi_desc);
}

static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
{
	/*
	 * PID.ON can be set at any time by a different vCPU or by hardware,
	 * e.g. a device.  PID.control must be written atomically, and the
	 * update must be retried with a fresh snapshot if an ON change causes
	 * the cmpxchg to fail.
	 */
	if (!try_cmpxchg64(&pi_desc->control, pold, new))
		return -EBUSY;

	return 0;
}

void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);
	struct pi_desc old, new;
	unsigned long flags;
	unsigned int dest;

	/*
	 * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
	 * PI.SN up-to-date even if there is no assigned device or if APICv is
	 * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SynIC.
	 */
	if (!enable_apicv || !lapic_in_kernel(vcpu))
		return;

	/*
	 * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
	 * full update can be skipped as neither the vector nor the destination
	 * needs to be changed.  Clear SN even if there is no assigned device,
	 * again for simplicity.
	 */
	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
		if (pi_test_and_clear_sn(pi_desc))
			goto after_clear_sn;
		return;
	}

	local_irq_save(flags);

	/*
	 * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
	 * list of the _previous_ pCPU, which will not be the same as the
	 * current pCPU if the task was migrated.
	 */
	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
		raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);

		/*
		 * In addition to taking the wakeup lock for the regular/IRQ
		 * context, tell lockdep it is being taken for the "sched out"
		 * context as well.  vCPU loads happen in task context, and
		 * this is taking the lock of the *previous* CPU, i.e. can race
		 * with both the scheduler and the wakeup handler.
		 */
		raw_spin_lock(spinlock);
		spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
		list_del(&vt->pi_wakeup_list);
		spin_release(&spinlock->dep_map, _RET_IP_);
		raw_spin_unlock(spinlock);
	}

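	/*
	 * In xAPIC mode the destination APIC ID occupies bits 15:8 of PI.NDST,
	 * hence the shift below; in x2APIC mode the full 32-bit ID is used
	 * as-is.
	 */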
	dest = cpu_physical_id(cpu);
	if (!x2apic_mode)
		dest = (dest << 8) & 0xFF00;

	old.control = READ_ONCE(pi_desc->control);
	do {
		new.control = old.control;

		/*
		 * Clear SN (as above) and refresh the destination APIC ID to
		 * handle task migration (@cpu != vcpu->cpu).
		 */
		new.ndst = dest;
		__pi_clear_sn(&new);

		/*
		 * Restore the notification vector; in the blocking case, the
		 * descriptor was modified on "put" to use the wakeup vector.
		 */
		new.nv = POSTED_INTR_VECTOR;
	} while (pi_try_set_control(pi_desc, &old.control, new.control));

	local_irq_restore(flags);

after_clear_sn:

	/*
	 * Clear SN before reading the bitmap.  The VT-d firmware
	 * writes the bitmap and reads SN atomically (5.2.3 in the
	 * spec), so it doesn't really have a memory barrier that
	 * pairs with this, but we cannot do that and we need one.
	 */
	smp_mb__after_atomic();

	if (!pi_is_pir_empty(pi_desc))
		pi_set_on(pi_desc);
}

static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
	/*
	 * Note, reading the number of possible bypass IRQs can race with a
	 * bypass IRQ being attached to the VM.  vmx_pi_start_bypass() ensures
	 * blocking vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
	 */
	return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
	       READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
}

/*
 * Put the vCPU on this pCPU's list of vCPUs that need to be awakened and set
 * WAKEUP as the notification vector in the PI descriptor.
 */
static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);
	struct pi_desc old, new;

	lockdep_assert_irqs_disabled();

	/*
	 * Acquire the wakeup lock using the "sched out" context to work around
	 * a lockdep false positive.  When this is called, schedule() holds
	 * various per-CPU scheduler locks.  When the wakeup handler runs, it
	 * holds this CPU's wakeup lock while calling try_to_wake_up(), which
	 * can eventually take the aforementioned scheduler locks, which causes
	 * lockdep to assume there is a deadlock.
	 *
	 * Deadlock can't actually occur because IRQs are disabled for the
	 * entirety of the sched_out critical section, i.e. the wakeup handler
	 * can't run while the scheduler locks are held.
	 */
	raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
			     PI_LOCK_SCHED_OUT);
	list_add_tail(&vt->pi_wakeup_list,
		      &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));

	WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");

	old.control = READ_ONCE(pi_desc->control);
	do {
		/* set 'NV' to 'wakeup vector' */
		new.control = old.control;
		new.nv = POSTED_INTR_WAKEUP_VECTOR;
	} while (pi_try_set_control(pi_desc, &old.control, new.control));

	/*
	 * Send a wakeup IPI to this CPU if an interrupt may have been posted
	 * before the notification vector was updated, in which case the IRQ
	 * will arrive on the non-wakeup vector.  An IPI is needed as calling
	 * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
	 * enabled until it is safe to call try_to_wake_up() on the task being
	 * scheduled out).
	 */
	if (pi_test_on(&new))
		__apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
}

static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
{
	/*
	 * The default posted interrupt vector does nothing when
	 * invoked outside guest mode.  Return whether a blocked vCPU
	 * can be the target of posted interrupts, as is the case when
	 * using either IPI virtualization or VT-d PI, so that the
	 * notification vector is switched to the one that calls
	 * back to the pi_wakeup_handler() function.
	 */
	return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) ||
		vmx_can_use_vtd_pi(vcpu->kvm);
}

void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!vmx_needs_pi_wakeup(vcpu))
		return;

	/*
	 * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
	 * enable the wakeup handler so that notification IRQ wakes the vCPU as
	 * expected.  There is no need to enable the wakeup handler if the vCPU
	 * is preempted between setting its wait state and manually scheduling
	 * out, as the task is still runnable, i.e. doesn't need a wake event
	 * from KVM to be scheduled in.
	 *
	 * If the wakeup handler isn't being enabled, Suppress Notifications as
	 * the cost of propagating PIR.IRR to PID.ON is negligible compared to
	 * the cost of a spurious IRQ, and vCPU put/load is a slow path.
	 */
	if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
	    ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
	     (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
		pi_enable_wakeup_handler(vcpu);
	else
		pi_set_sn(pi_desc);
}

/*
 * Handler for POSTED_INTR_WAKEUP_VECTOR: wake any blocking vCPU on this CPU
 * whose posted interrupt descriptor has an outstanding notification.
 */
void pi_wakeup_handler(void)
{
	int cpu = smp_processor_id();
	struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
	raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
	struct vcpu_vt *vt;

	raw_spin_lock(spinlock);
	list_for_each_entry(vt, wakeup_list, pi_wakeup_list) {
		if (pi_test_on(&vt->pi_desc))
			kvm_vcpu_wake_up(vt_to_vcpu(vt));
	}
	raw_spin_unlock(spinlock);
}

void __init pi_init_cpu(int cpu)
{
	INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
	raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}

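/*
 * Reset the PI descriptor when APIC state is restored: clear ON and drop any
 * stale posted interrupts from the PIR.
 */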
void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi = vcpu_to_pi_desc(vcpu);

	pi_clear_on(pi);
	memset(pi->pir, 0, sizeof(pi->pir));
}

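/*
 * A posted interrupt is pending if ON is set, or if notifications were
 * suppressed (SN set) while interrupts remain in the PIR.
 */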
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	return pi_test_on(pi_desc) ||
		(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
}

/*
 * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
 * blocking vCPUs may be scheduled out without reconfiguring PID.NV to the wakeup
 * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
 */
void vmx_pi_start_bypass(struct kvm *kvm)
{
	if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
		return;

	kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
}

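/*
 * Update the IRTE for a bypass (posted) IRQ: point the IRTE at the target
 * vCPU's PI descriptor so the device posts interrupts directly to the vCPU,
 * or revert to remapped delivery when no target vCPU is provided.
 */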
int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
		       unsigned int host_irq, uint32_t guest_irq,
		       struct kvm_vcpu *vcpu, u32 vector)
{
	if (vcpu) {
		struct intel_iommu_pi_data pi_data = {
			.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
			.vector = vector,
		};

		return irq_set_vcpu_affinity(host_irq, &pi_data);
	} else {
		return irq_set_vcpu_affinity(host_irq, NULL);
	}
}