From cd4178d19420359554e3da6fd77ecfd0f58067ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Jun 2025 16:51:47 -0700 Subject: KVM: arm64: WARN if unmapping a vLPI fails in any path When unmapping a vLPI, WARN if nullifying vCPU affinity fails, not just if failure occurs when freeing an ITE. If undoing vCPU affinity fails, then odds are very good that vLPI state tracking has has gotten out of whack, i.e. that KVM and the GIC disagree on the state of an IRQ/vLPI. At best, inconsistent state means there is a lurking bug/flaw somewhere. At worst, the inconsistency could eventually be fatal to the host, e.g. if an ITS command fails because KVM's view of things doesn't match reality/hardware. Note, only the call from kvm_arch_irq_bypass_del_producer() by way of kvm_vgic_v4_unset_forwarding() doesn't already WARN. Common KVM's kvm_irq_routing_update() WARNs if kvm_arch_update_irqfd_routing() fails. For that path, if its_unmap_vlpi() fails in kvm_vgic_v4_unset_forwarding(), the only possible causes are that the GIC doesn't have a v4 ITS (from its_irq_set_vcpu_affinity()): /* Need a v4 ITS */ if (!is_v4(its_dev->its)) return -EINVAL; guard(raw_spinlock)(&its_dev->event_map.vlpi_lock); /* Unmap request? */ if (!info) return its_vlpi_unmap(d); or that KVM has gotten out of sync with the GIC/ITS (from its_vlpi_unmap()): if (!its_dev->event_map.vm || !irqd_is_forwarded_to_vcpu(d)) return -EINVAL; All of the above failure scenarios are warnable offences, as they should never occur absent a kernel/KVM bug. Acked-by: Oliver Upton Link: https://lore.kernel.org/all/aFWY2LTVIxz5rfhh@google.com Signed-off-by: Sean Christopherson --- include/linux/irqchip/arm-gic-v4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 7f1f11a5e4e4..0b0887099fd7 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -146,7 +146,7 @@ int its_commit_vpe(struct its_vpe *vpe); int its_invall_vpe(struct its_vpe *vpe); int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); -int its_unmap_vlpi(int irq); +void its_unmap_vlpi(int irq); int its_prop_update_vlpi(int irq, u8 config, bool inv); int its_prop_update_vsgi(int irq, u8 priority, bool group); -- cgit v1.2.3 From 2b521d86ee80a436a92445b8206d38d75aeb39ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:29 -0700 Subject: irqbypass: Take ownership of producer/consumer token tracking Move ownership of IRQ bypass token tracking into irqbypass.ko, and explicitly require callers to pass an eventfd_ctx structure instead of a completely opaque token. Relying on producers and consumers to set the token appropriately is error prone, and hiding the fact that the token must be an eventfd_ctx pointer (for all intents and purposes) unnecessarily obfuscates the code and makes it more brittle. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 ++-- drivers/vfio/pci/vfio_pci_intrs.c | 9 +++----- drivers/vhost/vdpa.c | 8 +++---- include/linux/irqbypass.h | 35 +++++++++++++++++-------------- virt/kvm/eventfd.c | 7 +++---- virt/lib/irqbypass.c | 44 +++++++++++++++++++++++++-------------- 6 files changed, 58 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b58a74c1722d..3dc93e2d4777 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13669,8 +13669,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" - " fails: %d\n", irqfd->consumer.token, ret); + printk(KERN_INFO "irq bypass consumer (eventfd %p) unregistration" + " fails: %d\n", irqfd->consumer.eventfd, ret); spin_unlock_irq(&kvm->irqfds.lock); diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 565966351dfa..d87fe116762a 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -505,15 +505,12 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (ret) goto out_put_eventfd_ctx; - ctx->producer.token = trigger; ctx->producer.irq = irq; - ret = irq_bypass_register_producer(&ctx->producer); + ret = irq_bypass_register_producer(&ctx->producer, trigger); if (unlikely(ret)) { dev_info(&pdev->dev, - "irq bypass producer (token %p) registration fails: %d\n", - ctx->producer.token, ret); - - ctx->producer.token = NULL; + "irq bypass producer (eventfd %p) registration fails: %d\n", + trigger, ret); } ctx->trigger = trigger; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 5a49b5a6d496..7b265ffda697 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -213,10 +213,10 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) return; vq->call_ctx.producer.irq = irq; - ret = irq_bypass_register_producer(&vq->call_ctx.producer); + ret = irq_bypass_register_producer(&vq->call_ctx.producer, vq->call_ctx.ctx); if (unlikely(ret)) - dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n", - qid, vq->call_ctx.producer.token, ret); + dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n", + qid, vq->call_ctx.ctx, ret); } static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid) @@ -712,7 +712,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) vhost_vdpa_unsetup_vq_irq(v, idx); - vq->call_ctx.producer.token = NULL; } break; } @@ -753,7 +752,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, cb.callback = vhost_vdpa_virtqueue_cb; cb.private = vq; cb.trigger = vq->call_ctx.ctx; - vq->call_ctx.producer.token = vq->call_ctx.ctx; if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) vhost_vdpa_setup_vq_irq(v, idx); diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 9bdb2a781841..1b57d15ac4cf 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -10,6 +10,7 @@ #include +struct eventfd_ctx; struct irq_bypass_consumer; /* @@ -18,20 +19,20 @@ struct irq_bypass_consumer; * The IRQ bypass manager is a simple set of lists and callbacks that allows * IRQ producers (ex. physical interrupt sources) to be matched to IRQ * consumers (ex. virtualization hardware that allows IRQ bypass or offload) - * via a shared token (ex. eventfd_ctx). Producers and consumers register - * independently. When a token match is found, the optional @stop callback - * will be called for each participant. The pair will then be connected via - * the @add_* callbacks, and finally the optional @start callback will allow - * any final coordination. When either participant is unregistered, the - * process is repeated using the @del_* callbacks in place of the @add_* - * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings - * are not supported. + * via a shared eventfd_ctx. Producers and consumers register independently. + * When a producer and consumer are paired, i.e. an eventfd match is found, the + * optional @stop callback will be called for each participant. The pair will + * then be connected via the @add_* callbacks, and finally the optional @start + * callback will allow any final coordination. When either participant is + * unregistered, the process is repeated using the @del_* callbacks in place of + * the @add_* callbacks. eventfds must be unique per producer/consumer, 1:N + * pairings are not supported. */ /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer (non-NULL) + * @eventfd: eventfd context used to match producers and consumers * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -44,7 +45,7 @@ struct irq_bypass_consumer; */ struct irq_bypass_producer { struct list_head node; - void *token; + struct eventfd_ctx *eventfd; int irq; int (*add_consumer)(struct irq_bypass_producer *, struct irq_bypass_consumer *); @@ -57,7 +58,7 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management - * @token: opaque token to match between producer and consumer (non-NULL) + * @eventfd: eventfd context used to match producers and consumers * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) @@ -70,7 +71,7 @@ struct irq_bypass_producer { */ struct irq_bypass_consumer { struct list_head node; - void *token; + struct eventfd_ctx *eventfd; int (*add_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); void (*del_producer)(struct irq_bypass_consumer *, @@ -79,9 +80,11 @@ struct irq_bypass_consumer { void (*start)(struct irq_bypass_consumer *); }; -int irq_bypass_register_producer(struct irq_bypass_producer *); -void irq_bypass_unregister_producer(struct irq_bypass_producer *); -int irq_bypass_register_consumer(struct irq_bypass_consumer *); -void irq_bypass_unregister_consumer(struct irq_bypass_consumer *); +int irq_bypass_register_producer(struct irq_bypass_producer *producer, + struct eventfd_ctx *eventfd); +void irq_bypass_unregister_producer(struct irq_bypass_producer *producer); +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, + struct eventfd_ctx *eventfd); +void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer); #endif /* IRQBYPASS_H */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 11e5d1e3f12e..5bc6abe30748 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -426,15 +426,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (kvm_arch_has_irq_bypass()) { - irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; irqfd->consumer.stop = kvm_arch_irq_bypass_stop; irqfd->consumer.start = kvm_arch_irq_bypass_start; - ret = irq_bypass_register_consumer(&irqfd->consumer); + ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd); if (ret) - pr_info("irq bypass consumer (token %p) registration fails: %d\n", - irqfd->consumer.token, ret); + pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n", + irqfd->eventfd, ret); } #endif diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 28a4d933569a..e8d7c420db52 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -77,30 +77,32 @@ static void __disconnect(struct irq_bypass_producer *prod, /** * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure + * @eventfd: pointer to the eventfd context associated with the producer * * Add the provided IRQ producer to the list of producers and connect - * with any matching token found on the IRQ consumers list. + * with any matching eventfd found on the IRQ consumers list. */ -int irq_bypass_register_producer(struct irq_bypass_producer *producer) +int irq_bypass_register_producer(struct irq_bypass_producer *producer, + struct eventfd_ctx *eventfd) { struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; int ret; - if (!producer->token) + if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->token == producer->token) { + if (tmp->eventfd == eventfd) { ret = -EBUSY; goto out_err; } } list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { + if (consumer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) goto out_err; @@ -108,6 +110,7 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer) } } + producer->eventfd = eventfd; list_add(&producer->node, &producers); mutex_unlock(&lock); @@ -131,26 +134,28 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) struct irq_bypass_producer *tmp; struct irq_bypass_consumer *consumer; - if (!producer->token) + if (!producer->eventfd) return; mutex_lock(&lock); list_for_each_entry(tmp, &producers, node) { - if (tmp->token != producer->token) + if (tmp->eventfd != producer->eventfd) continue; list_for_each_entry(consumer, &consumers, node) { - if (consumer->token == producer->token) { + if (consumer->eventfd == producer->eventfd) { __disconnect(producer, consumer); break; } } + producer->eventfd = NULL; list_del(&producer->node); break; } + WARN_ON_ONCE(producer->eventfd); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -158,31 +163,35 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); /** * irq_bypass_register_consumer - register IRQ bypass consumer * @consumer: pointer to consumer structure + * @eventfd: pointer to the eventfd context associated with the consumer * * Add the provided IRQ consumer to the list of consumers and connect - * with any matching token found on the IRQ producer list. + * with any matching eventfd found on the IRQ producer list. */ -int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) +int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, + struct eventfd_ctx *eventfd) { struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; int ret; - if (!consumer->token || - !consumer->add_producer || !consumer->del_producer) + if (WARN_ON_ONCE(consumer->eventfd)) + return -EINVAL; + + if (!consumer->add_producer || !consumer->del_producer) return -EINVAL; mutex_lock(&lock); list_for_each_entry(tmp, &consumers, node) { - if (tmp->token == consumer->token || tmp == consumer) { + if (tmp->eventfd == eventfd) { ret = -EBUSY; goto out_err; } } list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { + if (producer->eventfd == eventfd) { ret = __connect(producer, consumer); if (ret) goto out_err; @@ -190,6 +199,7 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) } } + consumer->eventfd = eventfd; list_add(&consumer->node, &consumers); mutex_unlock(&lock); @@ -213,7 +223,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) struct irq_bypass_consumer *tmp; struct irq_bypass_producer *producer; - if (!consumer->token) + if (!consumer->eventfd) return; mutex_lock(&lock); @@ -223,16 +233,18 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) continue; list_for_each_entry(producer, &producers, node) { - if (producer->token == consumer->token) { + if (producer->eventfd == consumer->eventfd) { __disconnect(producer, consumer); break; } } + consumer->eventfd = NULL; list_del(&consumer->node); break; } + WARN_ON_ONCE(consumer->eventfd); mutex_unlock(&lock); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); -- cgit v1.2.3 From add57f493e0893ac0fb4acbdc441918d3e800f10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:30 -0700 Subject: irqbypass: Explicitly track producer and consumer bindings Explicitly track IRQ bypass producer:consumer bindings. This will allow making removal an O(1) operation; searching through the list to find information that is trivially tracked (and useful for debug) is wasteful. Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 7 +++++++ virt/lib/irqbypass.c | 9 +++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index 1b57d15ac4cf..b28197c87483 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -29,10 +29,13 @@ struct irq_bypass_consumer; * pairings are not supported. */ +struct irq_bypass_consumer; + /** * struct irq_bypass_producer - IRQ bypass producer definition * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers + * @consumer: The connected consumer (NULL if no connection) * @irq: Linux IRQ number for the producer device * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) @@ -46,6 +49,7 @@ struct irq_bypass_consumer; struct irq_bypass_producer { struct list_head node; struct eventfd_ctx *eventfd; + struct irq_bypass_consumer *consumer; int irq; int (*add_consumer)(struct irq_bypass_producer *, struct irq_bypass_consumer *); @@ -59,6 +63,7 @@ struct irq_bypass_producer { * struct irq_bypass_consumer - IRQ bypass consumer definition * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers + * @producer: The connected producer (NULL if no connection) * @add_producer: Connect the IRQ consumer to an IRQ producer * @del_producer: Disconnect the IRQ consumer from an IRQ producer * @stop: Perform any quiesce operations necessary prior to add/del (optional) @@ -72,6 +77,8 @@ struct irq_bypass_producer { struct irq_bypass_consumer { struct list_head node; struct eventfd_ctx *eventfd; + struct irq_bypass_producer *producer; + int (*add_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); void (*del_producer)(struct irq_bypass_consumer *, diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index e8d7c420db52..fdbf7ecc0c21 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod, if (prod->start) prod->start(prod); + if (!ret) { + prod->consumer = cons; + cons->producer = prod; + } return ret; } @@ -72,6 +76,9 @@ static void __disconnect(struct irq_bypass_producer *prod, cons->start(cons); if (prod->start) prod->start(prod); + + prod->consumer = NULL; + cons->producer = NULL; } /** @@ -145,6 +152,7 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) list_for_each_entry(consumer, &consumers, node) { if (consumer->eventfd == producer->eventfd) { + WARN_ON_ONCE(producer->consumer != consumer); __disconnect(producer, consumer); break; } @@ -234,6 +242,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) list_for_each_entry(producer, &producers, node) { if (producer->eventfd == consumer->eventfd) { + WARN_ON_ONCE(consumer->producer != producer); __disconnect(producer, consumer); break; } -- cgit v1.2.3 From 8394b32faecd9c63b3c436e78e62519e9548e530 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:33 -0700 Subject: irqbypass: Use xarray to track producers and consumers Track IRQ bypass producers and consumers using an xarray to avoid the O(2n) insertion time associated with walking a list to check for duplicate entries, and to search for an partner. At low (tens or few hundreds) total producer/consumer counts, using a list is faster due to the need to allocate backing storage for xarray. But as count creeps into the thousands, xarray wins easily, and can provide several orders of magnitude better latency at high counts. E.g. hundreds of nanoseconds vs. hundreds of milliseconds. Cc: Oliver Upton Cc: David Matlack Cc: Like Xu Cc: Binbin Wu Reported-by: Yong He Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217379 Link: https://lore.kernel.org/all/20230801115646.33990-1-likexu@tencent.com Reviewed-by: Kevin Tian Acked-by: Michael S. Tsirkin Reviewed-by: Alex Williamson Link: https://lore.kernel.org/r/20250516230734.2564775-8-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/irqbypass.h | 4 --- virt/lib/irqbypass.c | 74 ++++++++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index b28197c87483..cd64fcaa88fe 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -33,7 +33,6 @@ struct irq_bypass_consumer; /** * struct irq_bypass_producer - IRQ bypass producer definition - * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers * @consumer: The connected consumer (NULL if no connection) * @irq: Linux IRQ number for the producer device @@ -47,7 +46,6 @@ struct irq_bypass_consumer; * for a physical device assigned to a VM. */ struct irq_bypass_producer { - struct list_head node; struct eventfd_ctx *eventfd; struct irq_bypass_consumer *consumer; int irq; @@ -61,7 +59,6 @@ struct irq_bypass_producer { /** * struct irq_bypass_consumer - IRQ bypass consumer definition - * @node: IRQ bypass manager private list management * @eventfd: eventfd context used to match producers and consumers * @producer: The connected producer (NULL if no connection) * @add_producer: Connect the IRQ consumer to an IRQ producer @@ -75,7 +72,6 @@ struct irq_bypass_producer { * portions of the interrupt handling to the VM. */ struct irq_bypass_consumer { - struct list_head node; struct eventfd_ctx *eventfd; struct irq_bypass_producer *producer; diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index 828556c081f5..ea888b9203d2 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -22,8 +22,8 @@ MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IRQ bypass manager utility module"); -static LIST_HEAD(producers); -static LIST_HEAD(consumers); +static DEFINE_XARRAY(producers); +static DEFINE_XARRAY(consumers); static DEFINE_MUTEX(lock); /* @lock must be held when calling connect */ @@ -86,13 +86,13 @@ static void __disconnect(struct irq_bypass_producer *prod, * @producer: pointer to producer structure * @eventfd: pointer to the eventfd context associated with the producer * - * Add the provided IRQ producer to the list of producers and connect - * with any matching eventfd found on the IRQ consumers list. + * Add the provided IRQ producer to the set of producers and connect with the + * consumer with a matching eventfd, if one exists. */ int irq_bypass_register_producer(struct irq_bypass_producer *producer, struct eventfd_ctx *eventfd) { - struct irq_bypass_producer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_consumer *consumer; int ret; @@ -101,22 +101,20 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, guard(mutex)(&lock); - list_for_each_entry(tmp, &producers, node) { - if (tmp->eventfd == eventfd) - return -EBUSY; - } + ret = xa_insert(&producers, index, producer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(consumer, &consumers, node) { - if (consumer->eventfd == eventfd) { - ret = __connect(producer, consumer); - if (ret) - return ret; - break; + consumer = xa_load(&consumers, index); + if (consumer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&producers, index) != producer); + return ret; } } producer->eventfd = eventfd; - list_add(&producer->node, &producers); return 0; } EXPORT_SYMBOL_GPL(irq_bypass_register_producer); @@ -125,11 +123,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer); * irq_bypass_unregister_producer - unregister IRQ bypass producer * @producer: pointer to producer structure * - * Remove a previously registered IRQ producer from the list of producers - * and disconnect it from any connected IRQ consumer. + * Remove a previously registered IRQ producer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * consumer, if one exists. */ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) { + unsigned long index = (unsigned long)producer->eventfd; + if (!producer->eventfd) return; @@ -138,8 +139,8 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) if (producer->consumer) __disconnect(producer, producer->consumer); + WARN_ON_ONCE(xa_erase(&producers, index) != producer); producer->eventfd = NULL; - list_del(&producer->node); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); @@ -148,13 +149,13 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); * @consumer: pointer to consumer structure * @eventfd: pointer to the eventfd context associated with the consumer * - * Add the provided IRQ consumer to the list of consumers and connect - * with any matching eventfd found on the IRQ producer list. + * Add the provided IRQ consumer to the set of consumers and connect with the + * producer with a matching eventfd, if one exists. */ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, struct eventfd_ctx *eventfd) { - struct irq_bypass_consumer *tmp; + unsigned long index = (unsigned long)eventfd; struct irq_bypass_producer *producer; int ret; @@ -166,22 +167,20 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, guard(mutex)(&lock); - list_for_each_entry(tmp, &consumers, node) { - if (tmp->eventfd == eventfd) - return -EBUSY; - } + ret = xa_insert(&consumers, index, consumer, GFP_KERNEL); + if (ret) + return ret; - list_for_each_entry(producer, &producers, node) { - if (producer->eventfd == eventfd) { - ret = __connect(producer, consumer); - if (ret) - return ret; - break; + producer = xa_load(&producers, index); + if (producer) { + ret = __connect(producer, consumer); + if (ret) { + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); + return ret; } } consumer->eventfd = eventfd; - list_add(&consumer->node, &consumers); return 0; } EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); @@ -190,11 +189,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); * irq_bypass_unregister_consumer - unregister IRQ bypass consumer * @consumer: pointer to consumer structure * - * Remove a previously registered IRQ consumer from the list of consumers - * and disconnect it from any connected IRQ producer. + * Remove a previously registered IRQ consumer (note, it's safe to call this + * even if registration was unsuccessful). Disconnect from the associated + * producer, if one exists. */ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) { + unsigned long index = (unsigned long)consumer->eventfd; + if (!consumer->eventfd) return; @@ -203,7 +205,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) if (consumer->producer) __disconnect(consumer->producer, consumer); + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); consumer->eventfd = NULL; - list_del(&consumer->node); } EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer); -- cgit v1.2.3 From 23b54381cee2928e8b5622e654ca4516f30d2f1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 16 May 2025 16:07:34 -0700 Subject: irqbypass: Require producers to pass in Linux IRQ number during registration Pass in the Linux IRQ associated with an IRQ bypass producer instead of relying on the caller to set the field prior to registration, as there's no benefit to relying on callers to do the right thing. Take care to set producer->irq before __connect(), as KVM expects the IRQ to be valid as soon as a connection is possible. Reviewed-by: Kevin Tian Reviewed-by: Alex Williamson Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20250516230734.2564775-9-seanjc@google.com Signed-off-by: Sean Christopherson --- drivers/vfio/pci/vfio_pci_intrs.c | 3 +-- drivers/vhost/vdpa.c | 4 ++-- include/linux/irqbypass.h | 2 +- virt/lib/irqbypass.c | 5 ++++- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index d87fe116762a..123298a4dc8f 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -505,8 +505,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (ret) goto out_put_eventfd_ctx; - ctx->producer.irq = irq; - ret = irq_bypass_register_producer(&ctx->producer, trigger); + ret = irq_bypass_register_producer(&ctx->producer, trigger, irq); if (unlikely(ret)) { dev_info(&pdev->dev, "irq bypass producer (eventfd %p) registration fails: %d\n", diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 7b265ffda697..af1e1fdfd9ed 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -212,8 +212,8 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) if (!vq->call_ctx.ctx) return; - vq->call_ctx.producer.irq = irq; - ret = irq_bypass_register_producer(&vq->call_ctx.producer, vq->call_ctx.ctx); + ret = irq_bypass_register_producer(&vq->call_ctx.producer, + vq->call_ctx.ctx, irq); if (unlikely(ret)) dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n", qid, vq->call_ctx.ctx, ret); diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h index cd64fcaa88fe..ede1fa938152 100644 --- a/include/linux/irqbypass.h +++ b/include/linux/irqbypass.h @@ -84,7 +84,7 @@ struct irq_bypass_consumer { }; int irq_bypass_register_producer(struct irq_bypass_producer *producer, - struct eventfd_ctx *eventfd); + struct eventfd_ctx *eventfd, int irq); void irq_bypass_unregister_producer(struct irq_bypass_producer *producer); int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, struct eventfd_ctx *eventfd); diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index ea888b9203d2..62c160200be9 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -85,12 +85,13 @@ static void __disconnect(struct irq_bypass_producer *prod, * irq_bypass_register_producer - register IRQ bypass producer * @producer: pointer to producer structure * @eventfd: pointer to the eventfd context associated with the producer + * @irq: Linux IRQ number of the underlying producer device * * Add the provided IRQ producer to the set of producers and connect with the * consumer with a matching eventfd, if one exists. */ int irq_bypass_register_producer(struct irq_bypass_producer *producer, - struct eventfd_ctx *eventfd) + struct eventfd_ctx *eventfd, int irq) { unsigned long index = (unsigned long)eventfd; struct irq_bypass_consumer *consumer; @@ -99,6 +100,8 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer, if (WARN_ON_ONCE(producer->eventfd)) return -EINVAL; + producer->irq = irq; + guard(mutex)(&lock); ret = xa_insert(&producers, index, producer, GFP_KERNEL); -- cgit v1.2.3 From e295d2e7fbe69ddec772c951c466dfbfc1c96818 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:40 -0700 Subject: KVM: x86: Trigger I/O APIC route rescan in kvm_arch_irq_routing_update() Trigger the I/O APIC route rescan that's performed for a split IRQ chip after userspace updates IRQ routes in kvm_arch_irq_routing_update(), i.e. before dropping kvm->irq_lock. Calling kvm_make_all_cpus_request() under a mutex is perfectly safe, and the smp_wmb()+smp_mb__after_atomic() pair in __kvm_make_request()+kvm_check_request() ensures the new routing is visible to vCPUs prior to the request being visible to vCPUs. In all likelihood, commit b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") somewhat arbitrarily made the request outside of irq_lock to avoid holding irq_lock any longer than is strictly necessary. And then commit abdb080f7ac8 ("kvm/irqchip: kvm_arch_irq_routing_update renaming split") took the easy route of adding another arch hook instead of risking a functional change. Note, the call to synchronize_srcu_expedited() does NOT provide ordering guarantees with respect to vCPUs scanning the new routing; as above, the request infrastructure provides the necessary ordering. I.e. there's no need to wait for kvm_scan_ioapic_routes() to complete if it's actively running, because regardless of whether it grabs the old or new table, the vCPU will have another KVM_REQ_SCAN_IOAPIC pending, i.e. will rescan again and see the new mappings. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq_comm.c | 10 +++------- include/linux/kvm_host.h | 4 ---- virt/kvm/irqchip.c | 2 -- 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d6d792b5d1bd..e2ae62ff9cc2 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) ARRAY_SIZE(default_routing), 0); } -void kvm_arch_post_irq_routing_update(struct kvm *kvm) -{ - if (!irqchip_split(kvm)) - return; - kvm_make_scan_ioapic_request(kvm); -} - void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, u8 vector, unsigned long *ioapic_handled_vectors) { @@ -466,4 +459,7 @@ void kvm_arch_irq_routing_update(struct kvm *kvm) #ifdef CONFIG_KVM_HYPERV kvm_hv_irq_routing_update(kvm); #endif + + if (irqchip_split(kvm)) + kvm_make_scan_ioapic_request(kvm); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bde4fb5c6aa..9461517b4e62 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1024,14 +1024,10 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); -void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } -static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) -{ -} #endif #ifdef CONFIG_HAVE_KVM_IRQCHIP diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 162d8ed889f2..6ccabfd32287 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); - kvm_arch_post_irq_routing_update(kvm); - synchronize_srcu_expedited(&kvm->irq_srcu); new = old; -- cgit v1.2.3 From 77a74b8ff41ae620f5a5d727d596b670b7b9994e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:48 -0700 Subject: KVM: x86: Move kvm_{request,free}_irq_source_id() to i8254.c (PIT) Move kvm_{request,free}_irq_source_id() to i8254.c, i.e. the dedicated PIT emulation file, in anticipation of removing them entirely in favor of hardcoding the PIT's "requested" source ID (the source ID can only ever be '2', and the request can never fail). No functional change intended. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/i8254.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/irq_comm.c | 44 -------------------------------------------- include/linux/kvm_host.h | 2 -- 3 files changed, 44 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 59f956f35f4c..2bb223bf0dac 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -641,6 +641,50 @@ static void kvm_pit_reset(struct kvm_pit *pit) kvm_pit_reset_reinject(pit); } +static int kvm_request_irq_source_id(struct kvm *kvm) +{ + unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); + + if (irq_source_id >= BITS_PER_LONG) { + pr_warn("exhausted allocatable IRQ sources!\n"); + irq_source_id = -EFAULT; + goto unlock; + } + + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + set_bit(irq_source_id, bitmap); +unlock: + mutex_unlock(&kvm->irq_lock); + + return irq_source_id; +} + +static void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) +{ + ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); + + mutex_lock(&kvm->irq_lock); + if (irq_source_id < 0 || + irq_source_id >= BITS_PER_LONG) { + pr_err("IRQ source ID out of range!\n"); + goto unlock; + } + clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + if (!irqchip_full(kvm)) + goto unlock; + + kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); + kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); +unlock: + mutex_unlock(&kvm->irq_lock); +} + static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 99c521bd9db5..138c675dc24b 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -165,50 +165,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -int kvm_request_irq_source_id(struct kvm *kvm) -{ - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id; - - mutex_lock(&kvm->irq_lock); - irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); - - if (irq_source_id >= BITS_PER_LONG) { - pr_warn("exhausted allocatable IRQ sources!\n"); - irq_source_id = -EFAULT; - goto unlock; - } - - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - set_bit(irq_source_id, bitmap); -unlock: - mutex_unlock(&kvm->irq_lock); - - return irq_source_id; -} - -void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) -{ - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - - mutex_lock(&kvm->irq_lock); - if (irq_source_id < 0 || - irq_source_id >= BITS_PER_LONG) { - pr_err("IRQ source ID out of range!\n"); - goto unlock; - } - clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_full(kvm)) - goto unlock; - - kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); -unlock: - mutex_unlock(&kvm->irq_lock); -} - void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9461517b4e62..cba8fc4529e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1784,8 +1784,6 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); -int kvm_request_irq_source_id(struct kvm *kvm); -void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* -- cgit v1.2.3 From 61423c413a746fd5fe5b0d865ea722e11b01105e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:49 -0700 Subject: KVM: x86: Hardcode the PIT IRQ source ID to '2' Hardcode the PIT's source IRQ ID to '2' instead of "finding" that bit 2 is always the first available bit in irq_sources_bitmap. Bits 0 and 1 are set/reserved by kvm_arch_init_vm(), i.e. long before kvm_create_pit() can be invoked, and KVM allows at most one in-kernel PIT instance, i.e. it's impossible for the PIT to find a different free bit (there are no other users of kvm_request_irq_source_id(). Delete the now-defunct irq_sources_bitmap and all its associated code. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/i8254.c | 55 ++++++----------------------------------- arch/x86/kvm/i8254.h | 1 - arch/x86/kvm/x86.c | 6 ----- include/linux/kvm_host.h | 1 + 5 files changed, 8 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bf4459d637cc..4d68680f7051 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1397,7 +1397,6 @@ struct kvm_arch { bool pause_in_guest; bool cstate_in_guest; - unsigned long irq_sources_bitmap; s64 kvmclock_offset; /* diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 2bb223bf0dac..fa8187608cfc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work) if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) return; - kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false); + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode. @@ -641,47 +641,11 @@ static void kvm_pit_reset(struct kvm_pit *pit) kvm_pit_reset_reinject(pit); } -static int kvm_request_irq_source_id(struct kvm *kvm) +static void kvm_pit_clear_all(struct kvm *kvm) { - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id; - - mutex_lock(&kvm->irq_lock); - irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); - - if (irq_source_id >= BITS_PER_LONG) { - pr_warn("exhausted allocatable IRQ sources!\n"); - irq_source_id = -EFAULT; - goto unlock; - } - - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - set_bit(irq_source_id, bitmap); -unlock: - mutex_unlock(&kvm->irq_lock); - - return irq_source_id; -} - -static void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) -{ - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); - mutex_lock(&kvm->irq_lock); - if (irq_source_id < 0 || - irq_source_id >= BITS_PER_LONG) { - pr_err("IRQ source ID out of range!\n"); - goto unlock; - } - clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_full(kvm)) - goto unlock; - - kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); -unlock: + kvm_ioapic_clear_all(kvm->arch.vioapic, KVM_PIT_IRQ_SOURCE_ID); + kvm_pic_clear_all(kvm->arch.vpic, KVM_PIT_IRQ_SOURCE_ID); mutex_unlock(&kvm->irq_lock); } @@ -788,10 +752,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) if (!pit) return NULL; - pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) - goto fail_request; - mutex_init(&pit->pit_state.lock); pid = get_pid(task_tgid(current)); @@ -843,8 +803,7 @@ fail_register_pit: kvm_pit_set_reinject(pit, false); kthread_destroy_worker(pit->worker); fail_kthread: - kvm_free_irq_source_id(kvm, pit->irq_source_id); -fail_request: + kvm_pit_clear_all(kvm); kfree(pit); return NULL; } @@ -861,7 +820,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_pit_clear_all(kvm); kfree(pit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 338095829ec8..b9c1feb379a7 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -44,7 +44,6 @@ struct kvm_pit { struct kvm_io_device speaker_dev; struct kvm *kvm; struct kvm_kpit_state pit_state; - int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; struct kthread_worker *worker; struct kthread_work expired; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53f2ce2d40de..1d744730985e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12669,12 +12669,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); atomic_set(&kvm->arch.noncoherent_dma_count, 0); - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - &kvm->arch.irq_sources_bitmap); - raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cba8fc4529e8..4ff5ea29e343 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -190,6 +190,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 +#define KVM_PIT_IRQ_SOURCE_ID 2 extern struct mutex kvm_lock; extern struct list_head vm_list; -- cgit v1.2.3 From 628a27731e3f36de7ddce226f7e09ee70e40ed66 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 14:35:53 -0700 Subject: KVM: x86: Add CONFIG_KVM_IOAPIC to allow disabling in-kernel I/O APIC Add a Kconfig to allow building KVM without support for emulating a I/O APIC, PIC, and PIT, which is desirable for deployments that effectively don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to create an in-kernel I/O APIC. E.g. compiling out support eliminates a few thousand lines of guest-facing code and gives security folks warm fuzzies. As a bonus, wrapping relevant paths with CONFIG_KVM_IOAPIC #ifdefs makes it much easier for readers to understand which bits and pieces exist specifically for fully in-kernel IRQ chips. Opportunistically convert all two in-kernel uses of __KVM_HAVE_IOAPIC to CONFIG_KVM_IOAPIC, e.g. rather than add a second #ifdef to generate a stub for kvm_arch_post_irq_routing_update(). Acked-by: Kai Huang Link: https://lore.kernel.org/r/20250611213557.294358-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/Kconfig | 10 ++++++++++ arch/x86/kvm/Makefile | 5 +++-- arch/x86/kvm/i8254.h | 2 ++ arch/x86/kvm/irq.c | 8 ++++++++ arch/x86/kvm/irq.h | 9 +++++++++ arch/x86/kvm/irq_comm.c | 2 ++ arch/x86/kvm/lapic.c | 7 ++++++- arch/x86/kvm/trace.h | 2 ++ arch/x86/kvm/x86.c | 22 ++++++++++++++++++---- include/linux/kvm_host.h | 2 +- include/trace/events/kvm.h | 4 ++-- 12 files changed, 65 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4649a234f05..8d511eb03933 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1375,9 +1375,11 @@ struct kvm_arch { atomic_t noncoherent_dma_count; #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE atomic_t assigned_device_count; +#ifdef CONFIG_KVM_IOAPIC struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; +#endif atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map __rcu *apic_map; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2eeffcec5382..2c86673155c9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -166,6 +166,16 @@ config KVM_AMD_SEV Encrypted State (SEV-ES), and Secure Encrypted Virtualization with Secure Nested Paging (SEV-SNP) technologies on AMD processors. +config KVM_IOAPIC + bool "I/O APIC, PIC, and PIT emulation" + default y + depends on KVM + help + Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. + for full in-kernel APIC emulation. + + If unsure, say Y. + config KVM_SMM bool "System Management Mode emulation" default y diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a5d362c7b504..92c737257789 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -5,12 +5,13 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror include $(srctree)/virt/kvm/Makefile.kvm -kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ +kvm-y += x86.o emulate.o irq.o lapic.o \ + irq_comm.o cpuid.o pmu.o mtrr.o \ debugfs.o mmu/mmu.o mmu/page_track.o \ mmu/spte.o kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o +kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o kvm-$(CONFIG_KVM_HYPERV) += hyperv.o kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index b9c1feb379a7..e8bd59ad8a7c 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -8,6 +8,7 @@ #include +#ifdef CONFIG_KVM_IOAPIC struct kvm_kpit_channel_state { u32 count; /* can be 65536 */ u16 latched_count; @@ -64,5 +65,6 @@ int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); +#endif /* CONFIG_KVM_IOAPIC */ #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index fb3bad0f4965..4c219e9f52b0 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -76,8 +76,10 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v) if (!kvm_apic_accept_pic_intr(v)) return 0; +#ifdef CONFIG_KVM_IOAPIC if (pic_in_kernel(v->kvm)) return v->kvm->arch.vpic->output; +#endif WARN_ON_ONCE(!irqchip_split(v->kvm)); return pending_userspace_extint(v); @@ -136,8 +138,10 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v) return v->kvm->arch.xen.upcall_vector; #endif +#ifdef CONFIG_KVM_IOAPIC if (pic_in_kernel(v->kvm)) return kvm_pic_read_irq(v->kvm); /* PIC */ +#endif WARN_ON_ONCE(!irqchip_split(v->kvm)); return get_userspace_extint(v); @@ -171,7 +175,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); +#ifdef CONFIG_KVM_IOAPIC __kvm_migrate_pit_timer(vcpu); +#endif kvm_x86_call(migrate_timers)(vcpu); } @@ -187,6 +193,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } +#ifdef CONFIG_KVM_IOAPIC #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } @@ -273,3 +280,4 @@ int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) kvm_pic_update_irq(pic); return r; } +#endif diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7b8b54462f95..5e62c1f79ce6 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -18,6 +18,8 @@ #include #include "lapic.h" +#ifdef CONFIG_KVM_IOAPIC + #define PIC_NUM_PINS 16 #define SELECT_PIC(irq) \ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) @@ -79,12 +81,19 @@ static inline int irqchip_full(struct kvm *kvm) smp_rmb(); return mode == KVM_IRQCHIP_KERNEL; } +#else /* CONFIG_KVM_IOAPIC */ +static __always_inline int irqchip_full(struct kvm *kvm) +{ + return false; +} +#endif static inline int pic_in_kernel(struct kvm *kvm) { return irqchip_full(kvm); } + static inline int irqchip_split(struct kvm *kvm) { int mode = kvm->arch.irqchip_mode; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 13d84c25e503..14fc8db0206c 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -208,6 +208,7 @@ int kvm_set_routing_entry(struct kvm *kvm, * check kvm_arch_can_set_irq_routing() before calling this function. */ switch (ue->type) { +#ifdef CONFIG_KVM_IOAPIC case KVM_IRQ_ROUTING_IRQCHIP: if (irqchip_split(kvm)) return -EINVAL; @@ -231,6 +232,7 @@ int kvm_set_routing_entry(struct kvm *kvm, } e->irqchip.irqchip = ue->u.irqchip.irqchip; break; +#endif case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; e->msi.address_lo = ue->u.msi.address_lo; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73418dc0ebb2..4cf8c1f753d3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1455,7 +1455,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - int trigger_mode; + int __maybe_unused trigger_mode; /* Eoi the ioapic only if the ioapic doesn't own the vector. */ if (!kvm_ioapic_handles_vector(apic, vector)) @@ -1476,12 +1476,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) return; } +#ifdef CONFIG_KVM_IOAPIC if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); +#endif } static int apic_set_eoi(struct kvm_lapic *apic) @@ -3146,8 +3148,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); + +#ifdef CONFIG_KVM_IOAPIC if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); +#endif vcpu->arch.apic_arb_prio = 0; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4ef17990574d..ababdba2c186 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -270,6 +270,7 @@ TRACE_EVENT(kvm_cpuid, {0x6, "SIPI"}, \ {0x7, "ExtINT"} +#ifdef CONFIG_KVM_IOAPIC TRACE_EVENT(kvm_ioapic_set_irq, TP_PROTO(__u64 e, int pin, bool coalesced), TP_ARGS(e, pin, coalesced), @@ -314,6 +315,7 @@ TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, (__entry->e & (1<<15)) ? "level" : "edge", (__entry->e & (1<<16)) ? "|masked" : "") ); +#endif TRACE_EVENT(kvm_msi_set_irq, TP_PROTO(__u64 address, __u64 data), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1d744730985e..78dfa6c1cb01 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4632,17 +4632,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXT_CPUID: case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: +#ifdef CONFIG_KVM_IOAPIC case KVM_CAP_PIT: + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_REINJECT_CONTROL: +#endif case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: #ifdef CONFIG_KVM_HYPERV @@ -6937,9 +6940,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -ENOTTY; + +#ifdef CONFIG_KVM_IOAPIC /* * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be + * that these three variables' stack usage should be * combined, not added together. */ union { @@ -6947,6 +6952,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm_pit_state2 ps2; struct kvm_pit_config pit_config; } u; +#endif switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -6970,6 +6976,7 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; +#ifdef CONFIG_KVM_IOAPIC case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); @@ -7136,6 +7143,7 @@ set_pit2_out: r = kvm_vm_ioctl_reinject(kvm, &control); break; } +#endif case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); @@ -10595,8 +10603,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); +#ifdef CONFIG_KVM_IOAPIC else if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); +#endif if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; @@ -12799,7 +12809,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); +#ifdef CONFIG_KVM_IOAPIC kvm_free_pit(kvm); +#endif kvm_mmu_pre_destroy_vm(kvm); static_call_cond(kvm_x86_vm_pre_destroy)(kvm); @@ -12823,8 +12835,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); +#ifdef CONFIG_KVM_IOAPIC kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); +#endif kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4ff5ea29e343..3b5575d0b574 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1023,7 +1023,7 @@ void kvm_unlock_all_vcpus(struct kvm *kvm); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_KVM_IOAPIC void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); #else static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 96e581900c8e..1065a81ca57f 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -84,14 +84,14 @@ TRACE_EVENT(kvm_set_irq, ); #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ -#if defined(__KVM_HAVE_IOAPIC) +#ifdef CONFIG_KVM_IOAPIC #define kvm_irqchips \ {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ {KVM_IRQCHIP_IOAPIC, "IOAPIC"} -#endif /* defined(__KVM_HAVE_IOAPIC) */ +#endif /* CONFIG_KVM_IOAPIC */ #if defined(CONFIG_HAVE_KVM_IRQCHIP) -- cgit v1.2.3 From cb210737675ef4c1ad88721e84558eeb2f199312 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:06 -0700 Subject: KVM: Pass new routing entries and irqfd when updating IRTEs When updating IRTEs in response to a GSI routing or IRQ bypass change, pass the new/current routing information along with the associated irqfd. This will allow KVM x86 to harden, simplify, and deduplicate its code. Since adding/removing a bypass producer is now conveniently protected with irqfds.lock, i.e. can't run concurrently with kvm_irq_routing_update(), use the routing information cached in the irqfd instead of looking up the information in the current GSI routing tables. Opportunistically convert an existing printk() to pr_info() and put its string onto a single line (old code that strictly adhered to 80 chars). Link: https://lore.kernel.org/r/20250611224604.313496-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 7 ++++--- arch/x86/include/asm/kvm_host.h | 6 ++++-- arch/x86/kvm/svm/avic.c | 18 +++++++----------- arch/x86/kvm/svm/svm.h | 5 +++-- arch/x86/kvm/vmx/posted_intr.c | 19 ++++++++----------- arch/x86/kvm/vmx/posted_intr.h | 8 ++++++-- arch/x86/kvm/x86.c | 36 ++++++++++++++++++++---------------- include/linux/kvm_host.h | 7 +++++-- virt/kvm/eventfd.c | 11 +++++------ 9 files changed, 62 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 38a91bb5d4c7..a9a39e0375f7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2771,8 +2771,9 @@ bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, return memcmp(&old->msi, &new->msi, sizeof(new->msi)); } -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { /* * Remapping the vLPI requires taking the its_lock mutex to resolve @@ -2781,7 +2782,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, * * Unmap the vLPI and fall back to software LPI injection. */ - return kvm_vgic_v4_unset_forwarding(kvm, host_irq); + return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq); } void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44fd9ccc0624..2a14564dd08a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -297,6 +297,7 @@ enum x86_intercept_stage; */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irqfd; struct kvm_kernel_irq_routing_entry; /* @@ -1845,8 +1846,9 @@ struct kvm_x86_ops { void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); - int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); + int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new); void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 067f8e3f5a0d..49b73907de92 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -886,21 +887,14 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, return 0; } -/* - * avic_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; bool enable_remapped_mode = true; + bool set = !!new; int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) @@ -926,6 +920,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, if (e->type != KVM_IRQ_ROUTING_MSI) continue; + WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + /** * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e6f3c6a153a0..b35fce30d923 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -736,8 +736,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5c615e5845bf..110fb19848ab 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -2,6 +2,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include @@ -294,17 +295,9 @@ void vmx_pi_start_assignment(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); } -/* - * vmx_pi_update_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; @@ -312,6 +305,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; + bool set = !!new; int idx, ret = 0; if (!vmx_can_use_vtd_pi(kvm)) @@ -329,6 +323,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) continue; + + WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + /* * VT-d PI cannot support posting multicast/broadcast * interrupts to a vCPU, we still use interrupt remapping diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 80499ea0e674..a94afcb55f7f 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -3,6 +3,9 @@ #define __KVM_X86_VMX_POSTED_INTR_H #include +#include +#include + #include void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); @@ -11,8 +14,9 @@ void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); -int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, + unsigned int host_irq, uint32_t guest_irq, + struct kvm_kernel_irq_routing_entry *new); void vmx_pi_start_assignment(struct kvm *kvm); static inline int pi_find_highest_vector(struct pi_desc *pi_desc) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02237dbb7f32..e1304b002062 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13507,31 +13507,31 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); struct kvm *kvm = irqfd->kvm; - int ret; + int ret = 0; kvm_arch_start_assignment(irqfd->kvm); spin_lock_irq(&kvm->irqfds.lock); irqfd->producer = prod; - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - if (ret) - kvm_arch_end_assignment(irqfd->kvm); - + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, + irqfd->gsi, &irqfd->irq_entry); + if (ret) + kvm_arch_end_assignment(irqfd->kvm); + } spin_unlock_irq(&kvm->irqfds.lock); - return ret; } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { - int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); struct kvm *kvm = irqfd->kvm; + int ret; WARN_ON(irqfd->producer != prod); @@ -13544,11 +13544,13 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, spin_lock_irq(&kvm->irqfds.lock); irqfd->producer = NULL; - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 0); - if (ret) - printk(KERN_INFO "irq bypass consumer (eventfd %p) unregistration" - " fails: %d\n", irqfd->consumer.eventfd, ret); + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq, + irqfd->gsi, NULL); + if (ret) + pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", + irqfd->consumer.eventfd, ret); + } spin_unlock_irq(&kvm->irqfds.lock); @@ -13556,10 +13558,12 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm_arch_end_assignment(irqfd->kvm); } -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set); + return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq, + irqfd->gsi, new); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b5575d0b574..a4160c1c0c6b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2401,6 +2401,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) +struct kvm_kernel_irqfd; + bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); @@ -2408,8 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); +int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 5bc6abe30748..bd1766da6895 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -285,9 +285,9 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int __attribute__((weak)) kvm_arch_update_irqfd_routing( - struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { return 0; } @@ -618,9 +618,8 @@ void kvm_irq_routing_update(struct kvm *kvm) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { - int ret = kvm_arch_update_irqfd_routing( - irqfd->kvm, irqfd->producer->irq, - irqfd->gsi, 1); + int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); + WARN_ON(ret); } #endif -- cgit v1.2.3 From 05c5e23657e1d61c271c2f4a3a21d4d630b18a9b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:07 -0700 Subject: KVM: SVM: Track per-vCPU IRTEs using kvm_kernel_irqfd structure Track the IRTEs that are posting to an SVM vCPU via the associated irqfd structure and GSI routing instead of dynamically allocating a separate data structure. In addition to eliminating an atomic allocation, this will allow hoisting much of the IRTE update logic to common x86. Cc: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 71 ++++++++++++++++++----------------------------- arch/x86/kvm/svm/svm.h | 10 ++++--- include/linux/kvm_irqfd.h | 3 ++ 3 files changed, 36 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 49b73907de92..accc36958a75 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -76,14 +76,6 @@ static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); bool x2avic_enabled; -/* - * This is a wrapper of struct amd_iommu_ir_data. - */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; - static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -747,8 +739,8 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; unsigned long flags; - struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; if (!kvm_arch_has_assigned_device(vcpu->kvm)) return 0; @@ -762,11 +754,11 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) if (list_empty(&svm->ir_list)) goto out; - list_for_each_entry(ir, &svm->ir_list, node) { + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { if (activate) - ret = amd_iommu_activate_guest_mode(ir->data); + ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data); else - ret = amd_iommu_deactivate_guest_mode(ir->data); + ret = amd_iommu_deactivate_guest_mode(irqfd->irq_bypass_data); if (ret) break; } @@ -775,27 +767,30 @@ out: return ret; } -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +static void svm_ir_list_del(struct vcpu_svm *svm, + struct kvm_kernel_irqfd *irqfd, + struct amd_iommu_pi_data *pi) { unsigned long flags; - struct amd_svm_iommu_ir *cur; + struct kvm_kernel_irqfd *cur; spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) + list_for_each_entry(cur, &svm->ir_list, vcpu_list) { + if (cur->irq_bypass_data != pi->ir_data) + continue; + if (WARN_ON_ONCE(cur != irqfd)) continue; - list_del(&cur->node); - kfree(cur); + list_del(&irqfd->vcpu_list); break; } spin_unlock_irqrestore(&svm->ir_list_lock, flags); } -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +static int svm_ir_list_add(struct vcpu_svm *svm, + struct kvm_kernel_irqfd *irqfd, + struct amd_iommu_pi_data *pi) { - int ret = 0; unsigned long flags; - struct amd_svm_iommu_ir *ir; u64 entry; if (WARN_ON_ONCE(!pi->ir_data)) @@ -812,25 +807,14 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); struct vcpu_svm *prev_svm; - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } + if (!prev_vcpu) + return -EINVAL; prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); + svm_ir_list_del(prev_svm, irqfd, pi); } - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. - */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; + irqfd->irq_bypass_data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); @@ -845,10 +829,9 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, true, pi->ir_data); - list_add(&ir->node, &svm->ir_list); + list_add(&irqfd->vcpu_list, &svm->ir_list); spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; + return 0; } /* @@ -952,7 +935,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * scheduling information in IOMMU irte. */ if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); + svm_ir_list_add(svm, irqfd, &pi); } if (!ret && svm) { @@ -993,7 +976,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, vcpu = kvm_get_vcpu_by_id(kvm, id); if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); + svm_ir_list_del(to_svm(vcpu), irqfd, &pi); } } out: @@ -1005,8 +988,8 @@ static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { int ret = 0; - struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_kernel_irqfd *irqfd; lockdep_assert_held(&svm->ir_list_lock); @@ -1020,8 +1003,8 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) if (list_empty(&svm->ir_list)) return 0; - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { + ret = amd_iommu_update_ga(cpu, r, irqfd->irq_bypass_data); if (ret) return ret; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index b35fce30d923..cc27877d69ae 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -310,10 +310,12 @@ struct vcpu_svm { u64 *avic_physical_id_cache; /* - * Per-vcpu list of struct amd_svm_iommu_ir: - * This is used mainly to store interrupt remapping information used - * when update the vcpu affinity. This avoids the need to scan for - * IRTE and try to match ga_tag in the IOMMU driver. + * Per-vCPU list of irqfds that are eligible to post IRQs directly to + * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list + * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the + * target pCPU), when AVIC is toggled on/off (to (de)activate bypass), + * and if the irqfd becomes ineligible for posting (to put the IRTE + * back into remapped mode). */ struct list_head ir_list; spinlock_t ir_list_lock; diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 8ad43692e3bb..6510a48e62aa 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -59,6 +59,9 @@ struct kvm_kernel_irqfd { struct work_struct shutdown; struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; + + struct list_head vcpu_list; + void *irq_bypass_data; }; #endif /* __LINUX_KVM_IRQFD_H */ -- cgit v1.2.3 From 0a917e9d4b7070fafcbf7a8ec32d2aa444b4e757 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:08 -0700 Subject: KVM: SVM: Delete IRTE link from previous vCPU before setting new IRTE Delete the previous per-vCPU IRTE link prior to modifying the IRTE. If forcing the IRTE back to remapped mode fails, the IRQ is already broken; keeping stale metadata won't change that, and the IOMMU should be sufficiently paranoid to sanitize the IRTE when the IRQ is freed and reallocated. This will allow hoisting the vCPU tracking to common x86, which in turn will allow most of the IRTE update code to be deduplicated. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 60 ++++++++++------------------------------------- include/linux/kvm_irqfd.h | 1 + 2 files changed, 14 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index accc36958a75..7f7af8ff627d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -767,23 +767,19 @@ out: return ret; } -static void svm_ir_list_del(struct vcpu_svm *svm, - struct kvm_kernel_irqfd *irqfd, - struct amd_iommu_pi_data *pi) +static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) { + struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; unsigned long flags; - struct kvm_kernel_irqfd *cur; - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, vcpu_list) { - if (cur->irq_bypass_data != pi->ir_data) - continue; - if (WARN_ON_ONCE(cur != irqfd)) - continue; - list_del(&irqfd->vcpu_list); - break; - } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + if (!vcpu) + return; + + spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + list_del(&irqfd->vcpu_list); + spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); + + irqfd->irq_bypass_vcpu = NULL; } static int svm_ir_list_add(struct vcpu_svm *svm, @@ -796,24 +792,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, if (WARN_ON_ONCE(!pi->ir_data)) return -EINVAL; - /** - * In some cases, the existing irte is updated and re-set, - * so we need to check here if it's already been * added - * to the ir_list. - */ - if (pi->prev_ga_tag) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) - return -EINVAL; - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, irqfd, pi); - } - + irqfd->irq_bypass_vcpu = &svm->vcpu; irqfd->irq_bypass_data = pi->ir_data; spin_lock_irqsave(&svm->ir_list_lock, flags); @@ -905,6 +884,8 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new))); + svm_ir_list_del(irqfd); + /** * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. @@ -963,21 +944,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), irqfd, &pi); - } } out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 6510a48e62aa..361c07f4466d 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -60,6 +60,7 @@ struct kvm_kernel_irqfd { struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; + struct kvm_vcpu *irq_bypass_vcpu; struct list_head vcpu_list; void *irq_bypass_data; }; -- cgit v1.2.3 From 1da19c5ce0533796179d9e1b55e64bf78478c4c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:09 -0700 Subject: iommu/amd: KVM: SVM: Delete now-unused cached/previous GA tag fields Delete the amd_ir_data.prev_ga_tag field now that all usage is superfluous. Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 -- drivers/iommu/amd/amd_iommu_types.h | 1 - drivers/iommu/amd/iommu.c | 10 ---------- include/linux/amd-iommu.h | 2 +- 4 files changed, 1 insertion(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 7f7af8ff627d..38cdfb052a3a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -939,9 +939,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, /** * Here, pi is used to: * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. */ - pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); } diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index ccbab3a4811a..053a0f05768c 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -1054,7 +1054,6 @@ struct irq_2_irte { }; struct amd_ir_data { - u32 cached_ga_tag; struct amd_iommu *iommu; struct irq_2_irte irq_2_irte; struct msi_msg msi_entry; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3117d99cf83d..ed96050d4933 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3887,23 +3887,13 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) ir_data->cfg = irqd_cfg(data); pi_data->ir_data = ir_data; - pi_data->prev_ga_tag = ir_data->cached_ga_tag; if (pi_data->is_guest_mode) { ir_data->ga_root_ptr = (pi_data->base >> 12); ir_data->ga_vector = vcpu_pi_info->vector; ir_data->ga_tag = pi_data->ga_tag; ret = amd_iommu_activate_guest_mode(ir_data); - if (!ret) - ir_data->cached_ga_tag = pi_data->ga_tag; } else { ret = amd_iommu_deactivate_guest_mode(ir_data); - - /* - * This communicates the ga_tag back to the caller - * so that it can do all the necessary clean up. - */ - if (!ret) - ir_data->cached_ga_tag = 0; } return ret; diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 062fbd4c9b77..1f9b13d803c5 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -19,8 +19,8 @@ struct amd_iommu; */ struct amd_iommu_pi_data { u32 ga_tag; - u32 prev_ga_tag; u64 base; + bool is_guest_mode; struct vcpu_data *vcpu_data; void *ir_data; -- cgit v1.2.3 From c4cdbaf9d81c8387da8b9b91567d4b0eb1b8a549 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:24 -0700 Subject: iommu/amd: KVM: SVM: Use pi_desc_addr to derive ga_root_ptr Use vcpu_data.pi_desc_addr instead of amd_iommu_pi_data.base to get the GA root pointer. KVM is the only source of amd_iommu_pi_data.base, and KVM's one and only path for writing amd_iommu_pi_data.base computes the exact same value for vcpu_data.pi_desc_addr and amd_iommu_pi_data.base, and fills amd_iommu_pi_data.base if and only if vcpu_data.pi_desc_addr is valid, i.e. amd_iommu_pi_data.base is fully redundant. Cc: Maxim Levitsky Reviewed-by: Joao Martins Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 7 +++++-- drivers/iommu/amd/iommu.c | 2 +- include/linux/amd-iommu.h | 2 -- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 375a29022000..d95742faae11 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -894,8 +894,11 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, enable_remapped_mode = false; - /* Try to enable guest_mode in IRTE */ - pi.base = avic_get_backing_page_address(svm); + /* + * Try to enable guest_mode in IRTE. Note, the address + * of the vCPU's AVIC backing page is passed to the + * IOMMU via vcpu_info->pi_desc_addr. + */ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index ed96050d4933..bdfb57af7111 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3888,7 +3888,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) pi_data->ir_data = ir_data; if (pi_data->is_guest_mode) { - ir_data->ga_root_ptr = (pi_data->base >> 12); + ir_data->ga_root_ptr = (vcpu_pi_info->pi_desc_addr >> 12); ir_data->ga_vector = vcpu_pi_info->vector; ir_data->ga_tag = pi_data->ga_tag; ret = amd_iommu_activate_guest_mode(ir_data); diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 1f9b13d803c5..deeefc92a5cf 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -19,8 +19,6 @@ struct amd_iommu; */ struct amd_iommu_pi_data { u32 ga_tag; - u64 base; - bool is_guest_mode; struct vcpu_data *vcpu_data; void *ir_data; -- cgit v1.2.3 From 53527ea1b70224d16d29edbd5c850456469f00ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:34 -0700 Subject: iommu: KVM: Split "struct vcpu_data" into separate AMD vs. Intel structs Split the vcpu_data structure that serves as a handoff from KVM to IOMMU drivers into vendor specific structures. Overloading a single structure makes the code hard to read and maintain, is *very* misleading as it suggests that mixing vendors is actually supported, and bastardizing Intel's posted interrupt descriptor address when AMD's IOMMU already has its own structure is quite unnecessary. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-33-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/irq_remapping.h | 15 ++++++++++++++- arch/x86/kvm/svm/avic.c | 21 ++++++++------------- arch/x86/kvm/vmx/posted_intr.c | 4 ++-- drivers/iommu/amd/iommu.c | 12 ++++-------- drivers/iommu/intel/irq_remapping.c | 10 +++++----- include/linux/amd-iommu.h | 12 ------------ 6 files changed, 33 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 5036f13ab69f..2dbc9cb61c2f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -26,7 +26,20 @@ enum { IRQ_REMAP_X2APIC_MODE, }; -struct vcpu_data { +/* + * This is mainly used to communicate information back-and-forth + * between SVM and IOMMU for setting up and tearing down posted + * interrupt + */ +struct amd_iommu_pi_data { + u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */ + u32 ga_tag; + u32 vector; /* Guest vector of the interrupt */ + bool is_guest_mode; + void *ir_data; +}; + +struct intel_iommu_pi_data { u64 pi_desc_addr; /* Physical address of PI Descriptor */ u32 vector; /* Guest vector of the interrupt */ }; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index eed5c58ac07f..dc1526fef18d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -823,23 +823,18 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ if (vcpu && kvm_vcpu_apicv_active(vcpu)) { /* - * Try to enable guest_mode in IRTE. Note, the address - * of the vCPU's AVIC backing page is passed to the - * IOMMU via vcpu_info->pi_desc_addr. + * Try to enable guest_mode in IRTE. */ - struct vcpu_data vcpu_info = { - .pi_desc_addr = avic_get_backing_page_address(to_svm(vcpu)), - .vector = vector, - }; - - struct amd_iommu_pi_data pi = { - .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_id), + struct amd_iommu_pi_data pi_data = { + .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, + vcpu->vcpu_id), .is_guest_mode = true, - .vcpu_data = &vcpu_info, + .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), + .vector = vector, }; int ret; - ret = irq_set_vcpu_affinity(host_irq, &pi); + ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) return ret; @@ -850,7 +845,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * we can reference to them directly when we update vcpu * scheduling information in IOMMU irte. */ - return svm_ir_list_add(to_svm(vcpu), irqfd, &pi); + return svm_ir_list_add(to_svm(vcpu), irqfd, &pi_data); } return irq_set_vcpu_affinity(host_irq, NULL); } diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 687ffde3b61c..3a23c30f73cb 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -303,12 +303,12 @@ int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, struct kvm_vcpu *vcpu, u32 vector) { if (vcpu) { - struct vcpu_data vcpu_info = { + struct intel_iommu_pi_data pi_data = { .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)), .vector = vector, }; - return irq_set_vcpu_affinity(host_irq, &vcpu_info); + return irq_set_vcpu_affinity(host_irq, &pi_data); } else { return irq_set_vcpu_affinity(host_irq, NULL); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index e8e259c07265..8366d32252cd 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3860,10 +3860,10 @@ int amd_iommu_deactivate_guest_mode(void *data) } EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); -static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) +static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) { int ret; - struct amd_iommu_pi_data *pi_data = vcpu_info; + struct amd_iommu_pi_data *pi_data = info; struct amd_ir_data *ir_data = data->chip_data; struct irq_2_irte *irte_info = &ir_data->irq_2_irte; struct iommu_dev_data *dev_data; @@ -3886,14 +3886,10 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) ir_data->cfg = irqd_cfg(data); if (pi_data) { - struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; - pi_data->ir_data = ir_data; - WARN_ON_ONCE(!pi_data->is_guest_mode); - - ir_data->ga_root_ptr = (vcpu_pi_info->pi_desc_addr >> 12); - ir_data->ga_vector = vcpu_pi_info->vector; + ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); + ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; ret = amd_iommu_activate_guest_mode(ir_data); } else { diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index cf7b6882ec75..2fc451253dc3 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1244,10 +1244,10 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data, static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) { struct intel_ir_data *ir_data = data->chip_data; - struct vcpu_data *vcpu_pi_info = info; + struct intel_iommu_pi_data *pi_data = info; /* stop posting interrupts, back to the default mode */ - if (!vcpu_pi_info) { + if (!pi_data) { __intel_ir_reconfigure_irte(data, true); } else { struct irte irte_pi; @@ -1265,10 +1265,10 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) /* Update the posted mode fields */ irte_pi.p_pst = 1; irte_pi.p_urgent = 0; - irte_pi.p_vector = vcpu_pi_info->vector; - irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >> + irte_pi.p_vector = pi_data->vector; + irte_pi.pda_l = (pi_data->pi_desc_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); - irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & + irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) & ~(-1UL << PDA_HIGH_BIT); ir_data->irq_2_iommu.posted_vcpu = true; diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index deeefc92a5cf..99b4fa9a0296 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -12,18 +12,6 @@ struct amd_iommu; -/* - * This is mainly used to communicate information back-and-forth - * between SVM and IOMMU for setting up and tearing down posted - * interrupt - */ -struct amd_iommu_pi_data { - u32 ga_tag; - bool is_guest_mode; - struct vcpu_data *vcpu_data; - void *ir_data; -}; - #ifdef CONFIG_AMD_IOMMU struct task_struct; -- cgit v1.2.3 From b33252b9d17238a4a9fa5a29af6e6a2922a6c2b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:35 -0700 Subject: KVM: Don't WARN if updating IRQ bypass route fails Don't bother WARNing if updating an IRTE route fails now that vendor code provides much more precise WARNs. The generic WARN doesn't provide enough information to actually debug the problem, and has obviously done nothing to surface the myriad bugs in KVM x86's implementation. Drop all of the associated return code plumbing that existed just so that common KVM could WARN. Link: https://lore.kernel.org/r/20250611224604.313496-34-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 6 +++--- arch/arm64/kvm/vgic/vgic-v4.c | 8 +++----- arch/x86/kvm/irq.c | 8 ++++---- include/kvm/arm_vgic.h | 2 +- include/linux/kvm_host.h | 6 +++--- virt/kvm/eventfd.c | 15 ++++++--------- 6 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a9a39e0375f7..94fb2f096a20 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2771,9 +2771,9 @@ bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, return memcmp(&old->msi, &new->msi, sizeof(new->msi)); } -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { /* * Remapping the vLPI requires taking the its_lock mutex to resolve diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 911170d4a9c8..ef3481963122 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -527,18 +527,17 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq) return NULL; } -int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) +void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) { struct vgic_irq *irq; unsigned long flags; - int ret = 0; if (!vgic_supports_direct_msis(kvm)) - return 0; + return; irq = __vgic_host_irq_get_vlpi(kvm, host_irq); if (!irq) - return 0; + return; raw_spin_lock_irqsave(&irq->irq_lock, flags); WARN_ON(irq->hw && irq->host_irq != host_irq); @@ -550,5 +549,4 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); - return 0; } diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index cd9b56c6a5c3..b7b0fbc218b8 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -606,11 +606,11 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm_arch_end_assignment(irqfd->kvm); } -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return kvm_pi_update_irte(irqfd, new); + kvm_pi_update_irte(irqfd, new); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4a34f7f0a864..b2a04481de1a 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -434,7 +434,7 @@ struct kvm_kernel_irq_routing_entry; int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); -int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq); +void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq); int vgic_v4_load(struct kvm_vcpu *vcpu); void vgic_v4_commit(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a4160c1c0c6b..8e74ac0f90b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2410,9 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *, struct irq_bypass_producer *); void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); -int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new); +void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new); bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index bd1766da6895..719b242fc935 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -285,11 +285,11 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start( { } -int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) +void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, + struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) { - return 0; + } bool __attribute__((weak)) kvm_arch_irqfd_route_changed( @@ -617,11 +617,8 @@ void kvm_irq_routing_update(struct kvm *kvm) #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) if (irqfd->producer && - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { - int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); - - WARN_ON(ret); - } + kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) + kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); #endif } -- cgit v1.2.3 From 77bb184ab880171a1cedfbed9ab05977e6ae2258 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:36 -0700 Subject: KVM: Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing() Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing(). Calling arch code to know whether or not to call arch code is absurd. Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250611224604.313496-35-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 15 +++++---------- arch/x86/kvm/irq.c | 15 +++++---------- include/linux/kvm_host.h | 2 -- virt/kvm/eventfd.c | 10 +--------- 4 files changed, 11 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94fb2f096a20..04f2116927b1 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2761,20 +2761,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq); } -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - if (old->type != KVM_IRQ_ROUTING_MSI || - new->type != KVM_IRQ_ROUTING_MSI) - return true; - - return memcmp(&old->msi, &new->msi, sizeof(new->msi)); -} - void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { + if (old->type == KVM_IRQ_ROUTING_MSI && + new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; + /* * Remapping the vLPI requires taking the its_lock mutex to resolve * the new translation. We're in spinlock land at this point, so no diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b7b0fbc218b8..23e0acc07cb6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -610,17 +610,12 @@ void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - kvm_pi_update_irte(irqfd, new); -} - -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - if (old->type != KVM_IRQ_ROUTING_MSI || - new->type != KVM_IRQ_ROUTING_MSI) - return true; + if (old->type == KVM_IRQ_ROUTING_MSI && + new->type == KVM_IRQ_ROUTING_MSI && + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) + return; - return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); + kvm_pi_update_irte(irqfd, new); } #ifdef CONFIG_KVM_IOAPIC diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8e74ac0f90b1..fb9ec06aa807 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2413,8 +2413,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new); -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, - struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 719b242fc935..59b1e64697f1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -291,13 +291,6 @@ void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, { } - -bool __attribute__((weak)) kvm_arch_irqfd_route_changed( - struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - return true; -} #endif static int @@ -616,8 +609,7 @@ void kvm_irq_routing_update(struct kvm *kvm) irqfd_update(kvm, irqfd); #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) - if (irqfd->producer && - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) + if (irqfd->producer) kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); #endif } -- cgit v1.2.3 From 08d9ccdd1a5c75d7aca7ac3af56f723d780dd6ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:43 -0700 Subject: iommu/amd: KVM: SVM: Infer IsRun from validity of pCPU destination Infer whether or not a vCPU should be marked running from the validity of the pCPU on which it is running. amd_iommu_update_ga() already skips the IRTE update if the pCPU is invalid, i.e. passing %true for is_run with an invalid pCPU would be a blatant and egregrious KVM bug. Tested-by: Sairaj Kodilkar Link: https://lore.kernel.org/r/20250611224604.313496-42-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 11 +++++------ drivers/iommu/amd/iommu.c | 14 +++++++++----- include/linux/amd-iommu.h | 6 ++---- 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 42fd1868c32f..1960bb06c4b9 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -833,7 +833,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, entry = svm->avic_physical_id_entry; if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, - true, pi_data.ir_data); + pi_data.ir_data); irqfd->irq_bypass_data = pi_data.ir_data; list_add(&irqfd->vcpu_list, &svm->ir_list); @@ -842,8 +842,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, return irq_set_vcpu_affinity(host_irq, NULL); } -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) +static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu) { int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); @@ -862,7 +861,7 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) return 0; list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { - ret = amd_iommu_update_ga(cpu, r, irqfd->irq_bypass_data); + ret = amd_iommu_update_ga(cpu, irqfd->irq_bypass_data); if (ret) return ret; } @@ -924,7 +923,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id); spin_unlock_irqrestore(&svm->ir_list_lock, flags); } @@ -964,7 +963,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) */ spin_lock_irqsave(&svm->ir_list_lock, flags); - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + avic_update_iommu_vcpu_affinity(vcpu, -1); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; svm->avic_physical_id_entry = entry; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 0a0d73483195..3c4c81eb201b 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3990,15 +3990,17 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) * Update the pCPU information for an IRTE that is configured to post IRQs to * a vCPU, without issuing an IOMMU invalidation for the IRTE. * - * This API is intended to be used when a vCPU is scheduled in/out (or stops - * running for any reason), to do a fast update of IsRun and (conditionally) - * Destination. + * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination + * with the pCPU's APIC ID and set IsRun, else clear IsRun. I.e. treat vCPUs + * that are associated with a pCPU as running. This API is intended to be used + * when a vCPU is scheduled in/out (or stops running for any reason), to do a + * fast update of IsRun and (conditionally) Destination. * * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached * and thus don't require an invalidation to ensure the IOMMU consumes fresh * information. */ -int amd_iommu_update_ga(int cpu, bool is_run, void *data) +int amd_iommu_update_ga(int cpu, void *data) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -4015,8 +4017,10 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) APICID_TO_IRTE_DEST_LO(cpu); entry->hi.fields.destination = APICID_TO_IRTE_DEST_HI(cpu); + entry->lo.fields_vapic.is_run = true; + } else { + entry->lo.fields_vapic.is_run = false; } - entry->lo.fields_vapic.is_run = is_run; return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 99b4fa9a0296..fe0e16ffe0e5 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -30,8 +30,7 @@ static inline void amd_iommu_detect(void) { } /* IOMMU AVIC Function */ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); -extern int -amd_iommu_update_ga(int cpu, bool is_run, void *data); +extern int amd_iommu_update_ga(int cpu, void *data); extern int amd_iommu_activate_guest_mode(void *data); extern int amd_iommu_deactivate_guest_mode(void *data); @@ -44,8 +43,7 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) return 0; } -static inline int -amd_iommu_update_ga(int cpu, bool is_run, void *data) +static inline int amd_iommu_update_ga(int cpu, void *data) { return 0; } -- cgit v1.2.3 From f965255dc5033387ac7858787c6c792fd789ac34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:45 -0700 Subject: iommu/amd: KVM: SVM: Set pCPU info in IRTE when setting vCPU affinity Now that setting vCPU affinity is guarded with ir_list_lock, i.e. now that avic_physical_id_entry can be safely accessed, set the pCPU info straight-away when setting vCPU affinity. Putting the IRTE into posted mode, and then immediately updating the IRTE a second time if the target vCPU is running is wasteful and confusing. This also fixes a flaw where a posted IRQ that arrives between putting the IRTE into guest_mode and setting the correct destination could cause the IOMMU to ring the doorbell on the wrong pCPU. Link: https://lore.kernel.org/r/20250611224604.313496-44-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/irq_remapping.h | 1 + arch/x86/kvm/svm/avic.c | 26 ++++++++++++++------------ drivers/iommu/amd/iommu.c | 6 ++++-- include/linux/amd-iommu.h | 4 ++-- 4 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 2dbc9cb61c2f..4c75a17632f6 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -35,6 +35,7 @@ struct amd_iommu_pi_data { u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */ u32 ga_tag; u32 vector; /* Guest vector of the interrupt */ + int cpu; bool is_guest_mode; void *ir_data; }; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1960bb06c4b9..35c5fb831c28 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -727,6 +727,7 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { + int apic_id = kvm_cpu_get_apicid(vcpu->cpu); int ret = 0; unsigned long flags; struct vcpu_svm *svm = to_svm(vcpu); @@ -746,7 +747,7 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { if (activate) - ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data); + ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data, apic_id); else ret = amd_iommu_deactivate_guest_mode(irqfd->irq_bypass_data); if (ret) @@ -810,6 +811,18 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, */ guard(spinlock_irqsave)(&svm->ir_list_lock); + /* + * Update the target pCPU for IOMMU doorbells if the vCPU is + * running. If the vCPU is NOT running, i.e. is blocking or + * scheduled out, KVM will update the pCPU info when the vCPU + * is awakened and/or scheduled in. See also avic_vcpu_load(). + */ + entry = svm->avic_physical_id_entry; + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + else + pi_data.cpu = -1; + ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) return ret; @@ -824,17 +837,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, return -EIO; } - /* - * Update the target pCPU for IOMMU doorbells if the vCPU is - * running. If the vCPU is NOT running, i.e. is blocking or - * scheduled out, KVM will update the pCPU info when the vCPU - * is awakened and/or scheduled in. See also avic_vcpu_load(). - */ - entry = svm->avic_physical_id_entry; - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, - pi_data.ir_data); - irqfd->irq_bypass_data = pi_data.ir_data; list_add(&irqfd->vcpu_list, &svm->ir_list); return 0; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2eaba64be68a..6352930ad011 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3850,7 +3850,7 @@ int amd_iommu_update_ga(int cpu, void *data) } EXPORT_SYMBOL(amd_iommu_update_ga); -int amd_iommu_activate_guest_mode(void *data) +int amd_iommu_activate_guest_mode(void *data, int cpu) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -3871,6 +3871,8 @@ int amd_iommu_activate_guest_mode(void *data) entry->hi.fields.vector = ir_data->ga_vector; entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; + __amd_iommu_update_ga(entry, cpu); + return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); } @@ -3937,7 +3939,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; - ret = amd_iommu_activate_guest_mode(ir_data); + ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu); } else { ret = amd_iommu_deactivate_guest_mode(ir_data); } diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index fe0e16ffe0e5..c9f2df0c4596 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -32,7 +32,7 @@ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); extern int amd_iommu_update_ga(int cpu, void *data); -extern int amd_iommu_activate_guest_mode(void *data); +extern int amd_iommu_activate_guest_mode(void *data, int cpu); extern int amd_iommu_deactivate_guest_mode(void *data); #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ @@ -48,7 +48,7 @@ static inline int amd_iommu_update_ga(int cpu, void *data) return 0; } -static inline int amd_iommu_activate_guest_mode(void *data) +static inline int amd_iommu_activate_guest_mode(void *data, int cpu) { return 0; } -- cgit v1.2.3 From b9e53f9ff4a88f01b22524878c9a381a6c5f65ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:46:03 -0700 Subject: iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts Add plumbing to the AMD IOMMU driver to allow KVM to control whether or not an IRTE is configured to generate GA log interrupts. KVM only needs a notification if the target vCPU is blocking, so the vCPU can be awakened. If a vCPU is preempted or exits to userspace, KVM clears is_run, but will set the vCPU back to running when userspace does KVM_RUN and/or the vCPU task is scheduled back in, i.e. KVM doesn't need a notification. Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes from the KVM changes insofar as possible. Opportunistically swap the ordering of parameters for amd_iommu_update_ga() so that the match amd_iommu_activate_guest_mode(). Note, as of this writing, the AMD IOMMU manual doesn't list GALogIntr as a non-cached field, but per AMD hardware architects, it's not cached and can be safely updated without an invalidation. Link: https://lore.kernel.org/all/b29b8c22-2fd4-4b5e-b755-9198874157c7@amd.com Cc: Vasant Hegde Cc: Joao Martins Link: https://lore.kernel.org/r/20250611224604.313496-62-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/irq_remapping.h | 1 + arch/x86/kvm/svm/avic.c | 10 ++++++---- drivers/iommu/amd/iommu.c | 28 +++++++++++++++++----------- include/linux/amd-iommu.h | 9 ++++----- 4 files changed, 28 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 4c75a17632f6..5a0d42464d44 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -36,6 +36,7 @@ struct amd_iommu_pi_data { u32 ga_tag; u32 vector; /* Guest vector of the interrupt */ int cpu; + bool ga_log_intr; bool is_guest_mode; void *ir_data; }; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 5803f778999f..02f266901bfe 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -785,10 +785,12 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * is awakened and/or scheduled in. See also avic_vcpu_load(). */ entry = svm->avic_physical_id_entry; - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) { pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; - else + } else { pi_data.cpu = -1; + pi_data.ga_log_intr = true; + } ret = irq_set_vcpu_affinity(host_irq, &pi_data); if (ret) @@ -850,9 +852,9 @@ static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, void *data = irqfd->irq_bypass_data; if (!(action & AVIC_TOGGLE_ON_OFF)) - WARN_ON_ONCE(amd_iommu_update_ga(cpu, data)); + WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, true)); else if (cpu >= 0) - WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu)); + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, true)); else WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 4b0cc249771f..c50d4a8a51be 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3804,7 +3804,8 @@ static const struct irq_domain_ops amd_ir_domain_ops = { .deactivate = irq_remapping_deactivate, }; -static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu) +static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, + bool ga_log_intr) { if (cpu >= 0) { entry->lo.fields_vapic.destination = @@ -3812,8 +3813,10 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu) entry->hi.fields.destination = APICID_TO_IRTE_DEST_HI(cpu); entry->lo.fields_vapic.is_run = true; + entry->lo.fields_vapic.ga_log_intr = false; } else { entry->lo.fields_vapic.is_run = false; + entry->lo.fields_vapic.ga_log_intr = ga_log_intr; } } @@ -3822,16 +3825,19 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu) * a vCPU, without issuing an IOMMU invalidation for the IRTE. * * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination - * with the pCPU's APIC ID and set IsRun, else clear IsRun. I.e. treat vCPUs - * that are associated with a pCPU as running. This API is intended to be used - * when a vCPU is scheduled in/out (or stops running for any reason), to do a - * fast update of IsRun and (conditionally) Destination. + * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't + * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based + * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is + * blocking and requires a notification wake event). I.e. treat vCPUs that are + * associated with a pCPU as running. This API is intended to be used when a + * vCPU is scheduled in/out (or stops running for any reason), to do a fast + * update of IsRun, GALogIntr, and (conditionally) Destination. * * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached * and thus don't require an invalidation to ensure the IOMMU consumes fresh * information. */ -int amd_iommu_update_ga(int cpu, void *data) +int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -3845,14 +3851,14 @@ int amd_iommu_update_ga(int cpu, void *data) if (!ir_data->iommu) return -ENODEV; - __amd_iommu_update_ga(entry, cpu); + __amd_iommu_update_ga(entry, cpu, ga_log_intr); return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); } EXPORT_SYMBOL(amd_iommu_update_ga); -int amd_iommu_activate_guest_mode(void *data, int cpu) +int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) { struct amd_ir_data *ir_data = (struct amd_ir_data *)data; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -3871,12 +3877,11 @@ int amd_iommu_activate_guest_mode(void *data, int cpu) entry->lo.fields_vapic.valid = valid; entry->lo.fields_vapic.guest_mode = 1; - entry->lo.fields_vapic.ga_log_intr = 1; entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; entry->hi.fields.vector = ir_data->ga_vector; entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; - __amd_iommu_update_ga(entry, cpu); + __amd_iommu_update_ga(entry, cpu, ga_log_intr); return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, ir_data->irq_2_irte.index, entry); @@ -3947,7 +3952,8 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) ir_data->ga_vector = pi_data->vector; ir_data->ga_tag = pi_data->ga_tag; if (pi_data->is_guest_mode) - ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu); + ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, + pi_data->ga_log_intr); else ret = amd_iommu_deactivate_guest_mode(ir_data); } else { diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index c9f2df0c4596..8cced632ecd0 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -30,9 +30,8 @@ static inline void amd_iommu_detect(void) { } /* IOMMU AVIC Function */ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); -extern int amd_iommu_update_ga(int cpu, void *data); - -extern int amd_iommu_activate_guest_mode(void *data, int cpu); +extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr); +extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr); extern int amd_iommu_deactivate_guest_mode(void *data); #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ @@ -43,12 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) return 0; } -static inline int amd_iommu_update_ga(int cpu, void *data) +static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) { return 0; } -static inline int amd_iommu_activate_guest_mode(void *data, int cpu) +static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) { return 0; } -- cgit v1.2.3 From 283ed5001d6852f85c09ed2522331b2b197ba937 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:11 -0700 Subject: KVM: Use a local struct to do the initial vfs_poll() on an irqfd Use a function-local struct for the poll_table passed to vfs_poll(), as nothing in the vfs_poll() callchain grabs a long-term reference to the structure, i.e. its lifetime doesn't need to be tied to the irqfd. Using a local structure will also allow propagating failures out of the polling callback without further polluting kvm_kernel_irqfd. Opportunstically rename irqfd_ptable_queue_proc() to kvm_irqfd_register() to capture what it actually does. Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_irqfd.h | 1 - virt/kvm/eventfd.c | 26 +++++++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 361c07f4466d..ef8c134ded8a 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -55,7 +55,6 @@ struct kvm_kernel_irqfd { /* Used for setup/shutdown */ struct eventfd_ctx *eventfd; struct list_head list; - poll_table pt; struct work_struct shutdown; struct irq_bypass_consumer consumer; struct irq_bypass_producer *producer; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 59b1e64697f1..0b655376734e 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -245,12 +245,17 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) return ret; } -static void -irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, - poll_table *pt) +struct kvm_irqfd_pt { + struct kvm_kernel_irqfd *irqfd; + poll_table pt; +}; + +static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) { - struct kvm_kernel_irqfd *irqfd = - container_of(pt, struct kvm_kernel_irqfd, pt); + struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); + struct kvm_kernel_irqfd *irqfd = p->irqfd; + add_wait_queue_priority(wqh, &irqfd->wait); } @@ -298,6 +303,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { struct kvm_kernel_irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct kvm_irqfd_pt irqfd_pt; int ret; __poll_t events; int idx; @@ -387,7 +393,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) * a callback whenever someone signals the underlying eventfd */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); - init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); spin_lock_irq(&kvm->irqfds.lock); @@ -409,11 +414,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) spin_unlock_irq(&kvm->irqfds.lock); /* - * Check if there was an event already pending on the eventfd - * before we registered, and trigger it as if we didn't miss it. + * Register the irqfd with the eventfd by polling on the eventfd. If + * there was en event pending on the eventfd prior to registering, + * manually trigger IRQ injection. */ - events = vfs_poll(fd_file(f), &irqfd->pt); + irqfd_pt.irqfd = irqfd; + init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); + events = vfs_poll(fd_file(f), &irqfd_pt.pt); if (events & EPOLLIN) schedule_work(&irqfd->inject); -- cgit v1.2.3 From 0d09582b3a607436fd91d6ce813048a048ecbf10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:18 -0700 Subject: sched/wait: Add a waitqueue helper for fully exclusive priority waiters Add a waitqueue helper to add a priority waiter that requires exclusive wakeups, i.e. that requires that it be the _only_ priority waiter. The API will be used by KVM to ensure that at most one of KVM's irqfds is bound to a single eventfd (across the entire kernel). Open code the helper instead of using __add_wait_queue() so that the common path doesn't need to "handle" impossible failures. Cc: K Prateek Nayak Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-9-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/wait.h | 2 ++ kernel/sched/wait.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 965a19809c7e..09855d819418 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -164,6 +164,8 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4ab3ab195277..15632c89c4f2 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -47,6 +47,24 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry) +{ + struct list_head *head = &wq_head->head; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + + guard(spinlock_irqsave)(&wq_head->lock); + + if (!list_empty(head) && + (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY)) + return -EBUSY; + + list_add(&wq_entry->entry, head); + return 0; +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; -- cgit v1.2.3