From 4b24910c056995c0c0fa7c1b142696443b05fd8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Dec 2025 16:16:50 -0800 Subject: KVM: Add a simplified wrapper for registering perf callbacks Add a parameter-less API for registering perf callbacks in anticipation of introducing another x86-only parameter for handling mediated PMU PMIs. No functional change intended. Acked-by: Anup Patel Tested-by: Xudong Hao Tested-by: Manali Shukla Link: https://patch.msgid.link/20251206001720.468579-15-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..8e410d1a63df 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1749,10 +1749,17 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)); + +static inline void kvm_register_perf_callbacks(void) +{ + __kvm_register_perf_callbacks(NULL, NULL); +} + void kvm_unregister_perf_callbacks(void); #else -static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_register_perf_callbacks(void) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ -- cgit v1.2.3 From 70b02809ded96ec790721cd5061e20b63b622310 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Nov 2025 14:34:41 -0800 Subject: KVM: x86: Mark vmcs12 pages as dirty if and only if they're mapped Mark vmcs12 pages as dirty (in KVM's dirty log bitmap) if and only if the page is mapped, i.e. if the page is actually "active" in vmcs02. 
For some pages, KVM simply disables the associated VMCS control if the vmcs12 page is unreachable, i.e. it's possible for nested VM-Enter to succeed with a "bad" vmcs12 page. Link: https://patch.msgid.link/20251121223444.355422-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 15 +++------------ include/linux/kvm_host.h | 9 ++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6137e5307d0f..72fcb1228af4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3984,23 +3984,14 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - gfn_t gfn; + struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Don't need to mark the APIC access page dirty; it is never * written to by the CPU during APIC virtualization. */ - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..536d05e2726f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1381,6 +1381,7 @@ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); +void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, 
gfn_t gfn); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); @@ -1398,6 +1399,13 @@ static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, return __kvm_vcpu_map(vcpu, gpa, map, false); } +static inline void kvm_vcpu_map_mark_dirty(struct kvm_vcpu *vcpu, + struct kvm_host_map *map) +{ + if (kvm_vcpu_mapped(map)) + kvm_vcpu_mark_page_dirty(vcpu, map->gfn); +} + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, @@ -1410,7 +1418,6 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data int offset, int len); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len); -void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); /** * kvm_gpc_init - initialize gfn_to_pfn_cache. -- cgit v1.2.3 From 6538b6221cc2feda415ca1946e66a5ef02dc6a0a Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:18 -0600 Subject: KVM: guest_memfd: Remove partial hugepage handling from kvm_gmem_populate() kvm_gmem_populate(), and the associated post-populate callbacks, have some limited support for dealing with guests backed by hugepages by passing the order information along to each post-populate callback and iterating through the pages passed to kvm_gmem_populate() in hugepage-chunks. However, guest_memfd doesn't yet support hugepages, and in most cases additional changes in the kvm_gmem_populate() path would also be needed to actually allow for this functionality. This makes the existing code unnecessarily complex, and makes changes difficult to work through upstream due to theoretical impacts on hugepage support that can't be considered properly without an actual hugepage implementation to reference. 
So for now, remove what's there so changes for things like in-place conversion can be implemented/reviewed more efficiently. Suggested-by: Vishal Annapurve Co-developed-by: Vishal Annapurve Signed-off-by: Vishal Annapurve Tested-by: Vishal Annapurve Tested-by: Kai Huang Signed-off-by: Michael Roth Tested-by: Yan Zhao Reviewed-by: Yan Zhao Link: https://patch.msgid.link/20260108214622.1084057-3-michael.roth@amd.com [sean: check for !IS_ERR() before checking folio_order()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 94 +++++++++++++++++++----------------------------- arch/x86/kvm/vmx/tdx.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/guest_memfd.c | 30 +++++++++------- 4 files changed, 56 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 261d9ef8631b..a70bd3f19e29 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2267,67 +2267,53 @@ struct sev_gmem_populate_args { int fw_error; }; -static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, - void __user *src, int order, void *opaque) +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; + struct sev_data_snp_launch_update fw_args = {0}; struct kvm_sev_info *sev = to_kvm_sev_info(kvm); - int n_private = 0, ret, i; - int npages = (1 << order); - gfn_t gfn; + bool assigned = false; + int level; + int ret; if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) return -EINVAL; - for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { - struct sev_data_snp_launch_update fw_args = {0}; - bool assigned = false; - int level; - - ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); - if (ret || assigned) { - pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", - __func__, gfn, 
ret, assigned); - ret = ret ? -EINVAL : -EEXIST; - goto err; - } + ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = ret ? -EINVAL : -EEXIST; + goto out; + } - if (src) { - void *vaddr = kmap_local_pfn(pfn + i); + if (src) { + void *vaddr = kmap_local_pfn(pfn); - if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { - kunmap_local(vaddr); - ret = -EFAULT; - goto err; - } + if (copy_from_user(vaddr, src, PAGE_SIZE)) { kunmap_local(vaddr); + ret = -EFAULT; + goto out; } - - ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, - sev_get_asid(kvm), true); - if (ret) - goto err; - - n_private++; - - fw_args.gctx_paddr = __psp_pa(sev->snp_context); - fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); - fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); - fw_args.page_type = sev_populate_args->type; - - ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, - &fw_args, &sev_populate_args->fw_error); - if (ret) - goto fw_err; + kunmap_local(vaddr); } - return 0; + ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto out; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; -fw_err: + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); /* * If the firmware command failed handle the reclaim and cleanup of that - * PFN specially vs. prior pages which can be cleaned up below without - * needing to reclaim in advance. + * PFN before reporting an error. 
* * Additionally, when invalid CPUID function entries are detected, * firmware writes the expected values into the page and leaves it @@ -2337,26 +2323,20 @@ fw_err: * information to provide information on which CPUID leaves/fields * failed CPUID validation. */ - if (!snp_page_reclaim(kvm, pfn + i) && + if (ret && !snp_page_reclaim(kvm, pfn) && sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { - void *vaddr = kmap_local_pfn(pfn + i); + void *vaddr = kmap_local_pfn(pfn); - if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + if (copy_to_user(src, vaddr, PAGE_SIZE)) pr_debug("Failed to write CPUID page back to userspace\n"); kunmap_local(vaddr); } - /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ - n_private--; - -err: - pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", - __func__, ret, sev_populate_args->fw_error, n_private); - for (i = 0; i < n_private; i++) - kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); - +out: + pr_debug("%s: exiting with return code %d (fw_error %d)\n", + __func__, ret, sev_populate_args->fw_error); return ret; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 2d7a4d52ccfb..4fb042ce8ed1 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3118,7 +3118,7 @@ struct tdx_gmem_post_populate_arg { }; static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, int order, void *_arg) + void __user *src, void *_arg) { struct tdx_gmem_post_populate_arg *arg = _arg; struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..1d0cee72e560 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * Returns the number of pages that were populated. 
*/ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, int order, void *opaque); + void __user *src, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fdaea3422c30..24eb33c7948d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -151,6 +151,15 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) mapping_gfp_mask(inode->i_mapping), policy); mpol_cond_put(policy); + /* + * External interfaces like kvm_gmem_get_pfn() support dealing + * with hugepages to a degree, but internally, guest_memfd currently + * assumes that all folios are order-0 and handling would need + * to be updated for anything otherwise (e.g. page-clearing + * operations). + */ + WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio)); + return folio; } @@ -829,7 +838,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long struct kvm_memory_slot *slot; void __user *p; - int ret = 0, max_order; + int ret = 0; long i; lockdep_assert_held(&kvm->slots_lock); @@ -848,7 +857,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long filemap_invalidate_lock(file->f_mapping); npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); - for (i = 0; i < npages; i += (1 << max_order)) { + for (i = 0; i < npages; i++) { struct folio *folio; gfn_t gfn = start_gfn + i; pgoff_t index = kvm_gmem_get_index(slot, gfn); @@ -860,7 +869,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, NULL); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; @@ -874,20 +883,15 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t 
start_gfn, void __user *src, long } folio_unlock(folio); - WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || - (npages - i) < (1 << max_order)); ret = -EINVAL; - while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), - KVM_MEMORY_ATTRIBUTE_PRIVATE, - KVM_MEMORY_ATTRIBUTE_PRIVATE)) { - if (!max_order) - goto put_folio_and_exit; - max_order--; - } + if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) + goto put_folio_and_exit; p = src ? src + i * PAGE_SIZE : NULL; - ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + ret = post_populate(kvm, gfn, pfn, p, opaque); if (!ret) kvm_gmem_mark_prepared(folio); -- cgit v1.2.3 From 2a62345b30529e488beb6a1220577b3495933724 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 8 Jan 2026 15:46:22 -0600 Subject: KVM: guest_memfd: GUP source pages prior to populating guest memory Currently the post-populate callbacks handle copying source pages into private GPA ranges backed by guest_memfd, where kvm_gmem_populate() acquires the filemap invalidate lock, then calls a post-populate callback which may issue a get_user_pages() on the source pages prior to copying them into the private GPA (e.g. TDX). This will not be compatible with in-place conversion, where the userspace page fault path will attempt to acquire the filemap invalidate lock while holding the mm->mmap_lock, leading to a potential ABBA deadlock. Address this by hoisting the GUP above the filemap invalidate lock so that these page faults path can be taken early, prior to acquiring the filemap invalidate lock. 
It's not currently clear whether this issue is reachable with the current implementation of guest_memfd, which doesn't support in-place conversion, however it does provide a consistent mechanism to provide stable source/target PFNs to callbacks rather than punting to vendor-specific code, which allows for more commonality across architectures, which may be worthwhile even without in-place conversion. As part of this change, also begin enforcing that the 'src' argument to kvm_gmem_populate() must be page-aligned, as this greatly reduces the complexity around how the post-populate callbacks are implemented, and since no current in-tree users support using a non-page-aligned 'src' argument. Suggested-by: Sean Christopherson Co-developed-by: Sean Christopherson Co-developed-by: Vishal Annapurve Signed-off-by: Vishal Annapurve Tested-by: Vishal Annapurve Tested-by: Kai Huang Signed-off-by: Michael Roth Tested-by: Yan Zhao Reviewed-by: Yan Zhao Link: https://patch.msgid.link/20260108214622.1084057-7-michael.roth@amd.com [sean: avoid local "p" variable] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 33 +++++++++---------- arch/x86/kvm/vmx/tdx.c | 16 ++-------- include/linux/kvm_host.h | 4 +-- virt/kvm/guest_memfd.c | 83 ++++++++++++++++++++++++++++++++---------------- 4 files changed, 78 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index b4409bc652d1..0ab7c89262fb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2268,7 +2268,7 @@ struct sev_gmem_populate_args { }; static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *opaque) + struct page *src_page, void *opaque) { struct sev_gmem_populate_args *sev_populate_args = opaque; struct sev_data_snp_launch_update fw_args = {0}; @@ -2277,7 +2277,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int level; int ret; - if 
(WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)) return -EINVAL; ret = snp_lookup_rmpentry((u64)pfn, &assigned, &level); @@ -2288,15 +2288,14 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, goto out; } - if (src) { - void *vaddr = kmap_local_pfn(pfn); + if (src_page) { + void *src_vaddr = kmap_local_page(src_page); + void *dst_vaddr = kmap_local_pfn(pfn); - if (copy_from_user(vaddr, src, PAGE_SIZE)) { - kunmap_local(vaddr); - ret = -EFAULT; - goto out; - } - kunmap_local(vaddr); + memcpy(dst_vaddr, src_vaddr, PAGE_SIZE); + + kunmap_local(src_vaddr); + kunmap_local(dst_vaddr); } ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, @@ -2326,17 +2325,19 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (ret && !snp_page_reclaim(kvm, pfn) && sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { - void *vaddr = kmap_local_pfn(pfn); + void *src_vaddr = kmap_local_page(src_page); + void *dst_vaddr = kmap_local_pfn(pfn); - if (copy_to_user(src, vaddr, PAGE_SIZE)) - pr_debug("Failed to write CPUID page back to userspace\n"); + memcpy(src_vaddr, dst_vaddr, PAGE_SIZE); - kunmap_local(vaddr); + kunmap_local(src_vaddr); + kunmap_local(dst_vaddr); } out: - pr_debug("%s: exiting with return code %d (fw_error %d)\n", - __func__, ret, sev_populate_args->fw_error); + if (ret) + pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n", + __func__, gfn, ret, sev_populate_args->fw_error); return ret; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 4fb042ce8ed1..5df9d32d2058 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -3118,34 +3118,24 @@ struct tdx_gmem_post_populate_arg { }; static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *_arg) 
+ struct page *src_page, void *_arg) { struct tdx_gmem_post_populate_arg *arg = _arg; struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); u64 err, entry, level_state; gpa_t gpa = gfn_to_gpa(gfn); - struct page *src_page; int ret, i; if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) return -EIO; - /* - * Get the source page if it has been faulted in. Return failure if the - * source page has been swapped out or unmapped in primary memory. - */ - ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); - if (ret < 0) - return ret; - if (ret != 1) - return -ENOMEM; + if (!src_page) + return -EOPNOTSUPP; kvm_tdx->page_add_src = src_page; ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); kvm_tdx->page_add_src = NULL; - put_page(src_page); - if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION)) return ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1d0cee72e560..49c0cfe24fd8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2566,7 +2566,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * @gfn: starting GFN to be populated * @src: userspace-provided buffer containing data to copy into GFN range * (passed to @post_populate, and incremented on each iteration - * if not NULL) + * if not NULL). Must be page-aligned. * @npages: number of pages to copy from userspace-buffer * @post_populate: callback to issue for each gmem page that backs the GPA * range @@ -2581,7 +2581,7 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord * Returns the number of pages that were populated. 
*/ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - void __user *src, void *opaque); + struct page *page, void *opaque); long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e90879322fd0..923c51a3a525 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -820,12 +820,48 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE + +static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot, + struct file *file, gfn_t gfn, struct page *src_page, + kvm_gmem_populate_cb post_populate, void *opaque) +{ + pgoff_t index = kvm_gmem_get_index(slot, gfn); + struct folio *folio; + kvm_pfn_t pfn; + int ret; + + filemap_invalidate_lock(file->f_mapping); + + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out_unlock; + } + + folio_unlock(folio); + + if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) { + ret = -EINVAL; + goto out_put_folio; + } + + ret = post_populate(kvm, gfn, pfn, src_page, opaque); + if (!ret) + folio_mark_uptodate(folio); + +out_put_folio: + folio_put(folio); +out_unlock: + filemap_invalidate_unlock(file->f_mapping); + return ret; +} + long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { struct kvm_memory_slot *slot; - void __user *p; - int ret = 0; long i; @@ -834,6 +870,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (WARN_ON_ONCE(npages <= 0)) return -EINVAL; + if (WARN_ON_ONCE(!PAGE_ALIGNED(src))) + return -EINVAL; + slot = gfn_to_memslot(kvm, start_gfn); if (!kvm_slot_has_gmem(slot)) 
return -EINVAL; @@ -842,47 +881,37 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (!file) return -EFAULT; - filemap_invalidate_lock(file->f_mapping); - npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); for (i = 0; i < npages; i++) { - struct folio *folio; - gfn_t gfn = start_gfn + i; - pgoff_t index = kvm_gmem_get_index(slot, gfn); - kvm_pfn_t pfn; + struct page *src_page = NULL; if (signal_pending(current)) { ret = -EINTR; break; } - folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - break; + if (src) { + unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE; + + ret = get_user_pages_fast(uaddr, 1, 0, &src_page); + if (ret < 0) + break; + if (ret != 1) { + ret = -ENOMEM; + break; + } } - folio_unlock(folio); + ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page, + post_populate, opaque); - ret = -EINVAL; - if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1, - KVM_MEMORY_ATTRIBUTE_PRIVATE, - KVM_MEMORY_ATTRIBUTE_PRIVATE)) - goto put_folio_and_exit; + if (src_page) + put_page(src_page); - p = src ? src + i * PAGE_SIZE : NULL; - ret = post_populate(kvm, gfn, pfn, p, opaque); - if (!ret) - folio_mark_uptodate(folio); - -put_folio_and_exit: - folio_put(folio); if (ret) break; } - filemap_invalidate_unlock(file->f_mapping); - return ret && !i ? ret : i; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); -- cgit v1.2.3 From 3227c3a89d65fe7482312b7b27038d9ebd86f210 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 28 Jan 2026 18:07:33 +0000 Subject: irqchip/gic-v5: Check if impl is virt capable Now that there is support for creating a GICv5-based guest with KVM, check that the hardware itself supports virtualisation, skipping the setting of struct gic_kvm_info if not. Note: If native GICv5 virt is not supported, then nor is FEAT_GCIE_LEGACY, so we are able to skip altogether. 
Signed-off-by: Sascha Bischoff Reviewed-by: Lorenzo Pieralisi Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260128175919.3828384-33-sascha.bischoff@arm.com [maz: cosmetic changes] Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v5-irs.c | 2 ++ drivers/irqchip/irq-gic-v5.c | 10 ++++++++++ include/linux/irqchip/arm-gic-v5.h | 4 ++++ 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c index ce2732d649a3..eeeb40fb0eaa 100644 --- a/drivers/irqchip/irq-gic-v5-irs.c +++ b/drivers/irqchip/irq-gic-v5-irs.c @@ -743,6 +743,8 @@ static int __init gicv5_irs_init(struct device_node *node) * be consistent across IRSes by the architecture. */ if (list_empty(&irs_nodes)) { + idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR0); + gicv5_global_data.virt_capable = !!FIELD_GET(GICV5_IRS_IDR0_VIRT, idr); idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR1); irs_setup_pri_bits(idr); diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 41ef286c4d78..3c86bbc05761 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -1064,6 +1064,16 @@ static struct gic_kvm_info gic_v5_kvm_info __initdata; static void __init gic_of_setup_kvm_info(struct device_node *node) { + /* + * If we don't have native GICv5 virtualisation support, then + * we also don't have FEAT_GCIE_LEGACY - the architecture + * forbids this combination. 
+ */ + if (!gicv5_global_data.virt_capable) { + pr_info("GIC implementation is not virtualization capable\n"); + return; + } + gic_v5_kvm_info.type = GIC_V5; /* GIC Virtual CPU interface maintenance interrupt */ diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 68ddcdb1cec5..4cb71ce6e8ad 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -43,6 +43,7 @@ /* * IRS registers and tables structures */ +#define GICV5_IRS_IDR0 0x0000 #define GICV5_IRS_IDR1 0x0004 #define GICV5_IRS_IDR2 0x0008 #define GICV5_IRS_IDR5 0x0014 @@ -63,6 +64,8 @@ #define GICV5_IRS_IST_STATUSR 0x0194 #define GICV5_IRS_MAP_L2_ISTR 0x01c0 +#define GICV5_IRS_IDR0_VIRT BIT(6) + #define GICV5_IRS_IDR1_PRIORITY_BITS GENMASK(22, 20) #define GICV5_IRS_IDR1_IAFFID_BITS GENMASK(19, 16) @@ -278,6 +281,7 @@ struct gicv5_chip_data { u8 cpuif_pri_bits; u8 cpuif_id_bits; u8 irs_pri_bits; + bool virt_capable; struct { __le64 *l1ist_addr; u32 l2_size; -- cgit v1.2.3 From 376e2f8cca2816c489a9196e65cc904d1a907fd2 Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Sun, 4 Jan 2026 21:34:57 +0800 Subject: irqchip/riscv-imsic: Adjust the number of available guest irq files Currently, KVM assumes the minimum of implemented HGEIE bits and "BIT(gc->guest_index_bits) - 1" as the number of guest files available across all CPUs. This will not work when CPUs have different number of guest files because KVM may incorrectly allocate a guest file on a CPU with fewer guest files. To address above, during initialization, calculate the number of available guest interrupt files according to MMIO resources and constrain the number of guest interrupt files that can be allocated by KVM. 
Signed-off-by: Xu Lu Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20260104133457.57742-1-luxu.kernel@bytedance.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia.c | 2 +- drivers/irqchip/irq-riscv-imsic-state.c | 12 +++++++++++- include/linux/irqchip/riscv-imsic.h | 3 +++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index dad318185660..cac3c2b51d72 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -630,7 +630,7 @@ int kvm_riscv_aia_init(void) */ if (gc) kvm_riscv_aia_nr_hgei = min((ulong)kvm_riscv_aia_nr_hgei, - BIT(gc->guest_index_bits) - 1); + gc->nr_guest_files); else kvm_riscv_aia_nr_hgei = 0; diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c index b6cebfee9461..7566c8aa2d48 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.c +++ b/drivers/irqchip/irq-riscv-imsic-state.c @@ -784,7 +784,7 @@ static int __init imsic_parse_fwnode(struct fwnode_handle *fwnode, int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) { - u32 i, j, index, nr_parent_irqs, nr_mmios, nr_handlers = 0; + u32 i, j, index, nr_parent_irqs, nr_mmios, nr_guest_files, nr_handlers = 0; struct imsic_global_config *global; struct imsic_local_config *local; void __iomem **mmios_va = NULL; @@ -878,6 +878,7 @@ int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) } /* Configure handlers for target CPUs */ + global->nr_guest_files = BIT(global->guest_index_bits) - 1; for (i = 0; i < nr_parent_irqs; i++) { rc = imsic_get_parent_hartid(fwnode, i, &hartid); if (rc) { @@ -918,6 +919,15 @@ int __init imsic_setup_state(struct fwnode_handle *fwnode, void *opaque) local->msi_pa = mmios[index].start + reloff; local->msi_va = mmios_va[index] + reloff; + /* + * KVM uses global->nr_guest_files to determine the available guest + * interrupt files on each CPU. 
Take the minimum number of guest + interrupt files across all CPUs to avoid KVM incorrectly allocating + * a nonexistent or unmapped guest interrupt file on some CPUs. + */ + nr_guest_files = (resource_size(&mmios[index]) - reloff) / IMSIC_MMIO_PAGE_SZ - 1; + global->nr_guest_files = min(global->nr_guest_files, nr_guest_files); + nr_handlers++; } diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h index 7f3ff5c5ea53..4b348836de7a 100644 --- a/include/linux/irqchip/riscv-imsic.h +++ b/include/linux/irqchip/riscv-imsic.h @@ -68,6 +68,9 @@ struct imsic_global_config { /* Number of guest interrupt identities */ u32 nr_guest_ids; + /* Number of guest interrupt files per core */ + u32 nr_guest_files; + /* Per-CPU IMSIC addresses */ struct imsic_local_config __percpu *local; }; -- cgit v1.2.3 From 898885477e0fa23d2e42b65bcb7c250215ecac37 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 6 Feb 2026 15:35:51 +0100 Subject: KVM: s390: Use guest address to mark guest page dirty Stop using the userspace address to mark the guest page dirty. mark_page_dirty() expects a guest frame number, but was being passed a host virtual frame number. When slot == NULL, mark_page_dirty_in_slot() does nothing and does not complain. This means that in some circumstances the dirtiness of the guest page might have been lost. Fix by adding two fields in struct kvm_s390_adapter_int to keep the guest addresses, and use those for mark_page_dirty(). 
Fixes: f65470661f36 ("KVM: s390/interrupt: do not pin adapter interrupt pages") Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/interrupt.c | 6 ++++-- include/linux/kvm_host.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f55eca9aa638..1c2bb5cd7e12 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2768,13 +2768,13 @@ static int adapter_indicators_set(struct kvm *kvm, bit = get_ind_bit(adapter_int->ind_addr, adapter_int->ind_offset, adapter->swap); set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT); set_page_dirty_lock(ind_page); map = page_address(summary_page); bit = get_ind_bit(adapter_int->summary_addr, adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT); set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); @@ -2870,7 +2870,9 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; e->adapter.summary_addr = uaddr_s; + e->adapter.summary_gaddr = ue->u.adapter.summary_addr; e->adapter.ind_addr = uaddr_i; + e->adapter.ind_gaddr = ue->u.adapter.ind_addr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d93f75b05ae2..deb36007480d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -645,7 +645,9 @@ static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *mem struct kvm_s390_adapter_int { u64 
ind_addr; + u64 ind_gaddr; u64 summary_addr; + u64 summary_gaddr; u64 ind_offset; u32 summary_offset; u32 adapter_id; -- cgit v1.2.3