From 49ae248b61aefa0eff84dca8e81bd9306cdaa6c9 Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Thu, 18 Nov 2021 11:25:22 +0100 Subject: KVM: s390: Fix names of skey constants in api documentation They are defined in include/uapi/linux/kvm.h as KVM_S390_GET_SKEYS_NONE and KVM_S390_SKEYS_MAX, but the api documetation talks of KVM_S390_GET_KEYS_NONE and KVM_S390_SKEYS_ALLOC_MAX respectively. Signed-off-by: Janis Schoetterl-Glausch Reviewed-by: Janosch Frank Message-Id: <20211118102522.569660-1-scgl@linux.ibm.com> Signed-off-by: Janosch Frank --- Documentation/virt/kvm/api.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation/virt/kvm') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index aeeb071c7688..b86c7edae888 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3701,7 +3701,7 @@ KVM with the currently defined set of flags. :Architectures: s390 :Type: vm ioctl :Parameters: struct kvm_s390_skeys -:Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage +:Returns: 0 on success, KVM_S390_GET_SKEYS_NONE if guest is not using storage keys, negative value on error This ioctl is used to get guest storage key values on the s390 @@ -3720,7 +3720,7 @@ you want to get. The count field is the number of consecutive frames (starting from start_gfn) whose storage keys to get. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +allowed value is defined as KVM_S390_SKEYS_MAX. Values outside this range will cause the ioctl to return -EINVAL. The skeydata_addr field is the address to a buffer large enough to hold count @@ -3744,7 +3744,7 @@ you want to set. The count field is the number of consecutive frames (starting from start_gfn) whose storage keys to get. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +allowed value is defined as KVM_S390_SKEYS_MAX. Values outside this range will cause the ioctl to return -EINVAL. The skeydata_addr field is the address to a buffer containing count bytes of -- cgit v1.2.3 From bb3b394d35e80d7a58ce015191e4960a13f54ba5 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:51 +0800 Subject: KVM: X86: Rename gpte_is_8_bytes to has_4_byte_gpte and invert the direction This bit is very close to mean "role.quadrant is not in use", except that it is false also when the MMU is mapping guest physical addresses directly. In that case, role.quadrant is indeed not in use, but there are no guest PTEs at all. Changing the name and direction of the bit removes the special case, since a guest with paging disabled, or not considering guest paging structures as is the case for two-dimensional paging, does not have to deal with 4-byte guest PTEs. Suggested-by: Paolo Bonzini Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-10-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/mmu.rst | 8 ++++---- arch/x86/include/asm/kvm_host.h | 8 ++++---- arch/x86/kvm/mmu/mmu.c | 12 ++++++------ arch/x86/kvm/mmu/mmutrace.h | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'Documentation/virt/kvm') diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index f60f5488e121..5b1ebad24c77 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -161,7 +161,7 @@ Shadow pages contain the following information: If clear, this page corresponds to a guest page table denoted by the gfn field. role.quadrant: - When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit + When role.has_4_byte_gpte=1, the guest uses 32-bit gptes while the host uses 64-bit sptes. That means a guest page table contains more ptes than the host, so multiple shadow pages are needed to shadow one guest page. For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the @@ -177,9 +177,9 @@ Shadow pages contain the following information: The page is invalid and should not be used. It is a root page that is currently pinned (by a cpu hardware register pointing to it); once it is unpinned it will be destroyed. - role.gpte_is_8_bytes: - Reflects the size of the guest PTE for which the page is valid, i.e. '1' - if 64-bit gptes are in use, '0' if 32-bit gptes are in use. + role.has_4_byte_gpte: + Reflects the size of the guest PTE for which the page is valid, i.e. '0' + if direct map or 64-bit gptes are in use, '1' if 32-bit gptes are in use. role.efer_nx: Contains the value of efer.nx for which the page is valid. role.cr0_wp: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f1414df18e11..5d48ba6d1487 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -296,14 +296,14 @@ struct kvm_kernel_irq_routing_entry; * * - invalid shadow pages are not accounted, so the bits are effectively 18 * - * - quadrant will only be used if gpte_is_8_bytes=0 (non-PAE paging); + * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging); * execonly and ad_disabled are only used for nested EPT which has - * gpte_is_8_bytes=1. Therefore, 2 bits are always unused. + * has_4_byte_gpte=0. Therefore, 2 bits are always unused. * * - the 4 bits of level are effectively limited to the values 2/3/4/5, * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE * paging has exactly one upper level, making level completely redundant - * when gpte_is_8_bytes=0. + * when has_4_byte_gpte=1. * * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if * cr0_wp=0, therefore these three bits only give rise to 5 possibilities. @@ -315,7 +315,7 @@ union kvm_mmu_page_role { u32 word; struct { unsigned level:4; - unsigned gpte_is_8_bytes:1; + unsigned has_4_byte_gpte:1; unsigned quadrant:2; unsigned direct:1; unsigned access:3; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 41613963a455..1ccee4d17481 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2081,7 +2081,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.level = level; role.direct = direct; role.access = access; - if (!direct_mmu && !role.gpte_is_8_bytes) { + if (role.has_4_byte_gpte) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -4746,7 +4746,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.base.ad_disabled = (shadow_accessed_mask == 0); role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; return role; } @@ -4791,7 +4791,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); - role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); + role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs); return role; } @@ -4890,7 +4890,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; role.base.level = level; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; role.base.guest_mode = true; @@ -5168,7 +5168,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, gpa, bytes, sp->role.word); offset = offset_in_page(gpa); - pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; + pte_size = sp->role.has_4_byte_gpte ? 4 : 8; /* * Sometimes, the OS only writes the last one bytes to update status @@ -5192,7 +5192,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) page_offset = offset_in_page(gpa); level = sp->role.level; *nspte = 1; - if (!sp->role.gpte_is_8_bytes) { + if (sp->role.has_4_byte_gpte) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index b8151bbca36a..de5e8e4e1aa7 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -35,7 +35,7 @@ " %snxe %sad root %u %s%c", \ __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ - role.gpte_is_8_bytes ? 8 : 4, \ + role.has_4_byte_gpte ? 4 : 8, \ role.quadrant, \ role.direct ? " direct" : "", \ access_str[role.access], \ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1db8496259ad..b69e47e68307 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -165,7 +165,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = true; - role.gpte_is_8_bytes = true; + role.has_4_byte_gpte = false; role.access = ACC_ALL; role.ad_disabled = !shadow_accessed_mask; -- cgit v1.2.3 From 1cfc9c4b9d4606a1e90e7dbc50058b9f0c1d43a6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:22 +0000 Subject: KVM: x86/xen: Maintain valid mapping of Xen shared_info page Use the newly reinstated gfn_to_pfn_cache to maintain a kernel mapping of the Xen shared_info page so that it can be accessed in atomic context. Note that we do not participate in dirty tracking for the shared info page and we do not explicitly mark it dirty every single tim we deliver an event channel interrupts. We wouldn't want to do that even if we *did* have a valid vCPU context with which to do so. Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-4-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 12 ++++++++++++ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/xen.c | 25 ++++++++++++++----------- 3 files changed, 27 insertions(+), 12 deletions(-) (limited to 'Documentation/virt/kvm') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b86c7edae888..c168be764707 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -371,6 +371,9 @@ The bits in the dirty bitmap are cleared before the ioctl returns, unless KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, see the description of the capability. +Note that the Xen shared info page, if configured, shall always be assumed +to be dirty. KVM will not explicitly mark it such. + 4.9 KVM_SET_MEMORY_ALIAS ------------------------ @@ -5134,6 +5137,15 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO not aware of the Xen CPU id which is used as the index into the vcpu_info[] array, so cannot know the correct default location. + Note that the shared info page may be constantly written to by KVM; + it contains the event channel bitmap used to deliver interrupts to + a Xen guest, amongst other things. It is exempt from dirty tracking + mechanisms — KVM will not explicitly mark the page as dirty each + time an event channel interrupt is delivered to the guest! Thus, + userspace should always assume that the designated GFN is dirty if + any vCPU has been running or any event channel interrupts can be + routed to the guest. + KVM_XEN_ATTR_TYPE_UPCALL_VECTOR Sets the exception vector used to deliver Xen event channel upcalls. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ce622b89c5d8..6e61e11e750f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1020,7 +1020,7 @@ struct msr_bitmap_range { struct kvm_xen { bool long_mode; u8 upcall_vector; - gfn_t shinfo_gfn; + struct gfn_to_pfn_cache shinfo_cache; }; enum kvm_irqchip_mode { diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index dff2bdf9507a..da4bf2c6407f 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -23,16 +23,21 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { - ret = -EFAULT; + if (gfn == GPA_INVALID) { + kvm_gfn_to_pfn_cache_destroy(kvm, gpc); goto out; } - kvm->arch.xen.shinfo_gfn = gfn; + + ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, gpa, + PAGE_SIZE, false); + if (ret) + goto out; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -260,15 +265,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_gfn = GPA_INVALID; - r = 0; - break; - } r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); break; - case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; @@ -299,7 +298,10 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn; + if (kvm->arch.xen.shinfo_cache.active) + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); + else + data->u.shared_info.gfn = GPA_INVALID; r = 0; break; @@ -661,11 +663,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) void kvm_xen_init_vm(struct kvm *kvm) { - kvm->arch.xen.shinfo_gfn = GPA_INVALID; } void kvm_xen_destroy_vm(struct kvm *kvm) { + kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + if (kvm->arch.xen_hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); } -- cgit v1.2.3 From 14243b387137a4afbe1df5d9dc15182d6657bb79 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 10 Dec 2021 16:36:23 +0000 Subject: KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery This adds basic support for delivering 2 level event channels to a guest. Initially, it only supports delivery via the IRQ routing table, triggered by an eventfd. In order to do so, it has a kvm_xen_set_evtchn_fast() function which will use the pre-mapped shared_info page if it already exists and is still valid, while the slow path through the irqfd_inject workqueue will remap the shared_info page if necessary. It sets the bits in the shared_info page but not the vcpu_info; that is deferred to __kvm_xen_has_interrupt() which raises the vector to the appropriate vCPU. Add a 'verbose' mode to xen_shinfo_test while adding test cases for this. Signed-off-by: David Woodhouse Message-Id: <20211210163625.2886-5-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 21 ++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq_comm.c | 12 + arch/x86/kvm/x86.c | 3 +- arch/x86/kvm/xen.c | 262 ++++++++++++++++++++- arch/x86/kvm/xen.h | 9 + include/linux/kvm_host.h | 7 + include/uapi/linux/kvm.h | 11 + .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 184 ++++++++++++++- 9 files changed, 503 insertions(+), 7 deletions(-) (limited to 'Documentation/virt/kvm') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c168be764707..6b683dfea8f2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1799,6 +1799,7 @@ No flags are specified so far, the corresponding field must be set to zero. struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_hv_sint hv_sint; + struct kvm_irq_routing_xen_evtchn xen_evtchn; __u32 pad[8]; } u; }; @@ -1808,6 +1809,7 @@ No flags are specified so far, the corresponding field must be set to zero. #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 + #define KVM_IRQ_ROUTING_XEN_EVTCHN 5 flags: @@ -1859,6 +1861,20 @@ address_hi must be zero. __u32 sint; }; + struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; + }; + + +When KVM_CAP_XEN_HVM includes the KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL bit +in its indication of supported features, routing to Xen event channels +is supported. Although the priority field is present, only the value +KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL is supported, which means delivery by +2 level event channels. FIFO event channel support may be added in +the future. + 4.55 KVM_SET_TSC_KHZ -------------------- @@ -7413,6 +7429,7 @@ PVHVM guests. Valid flags are:: #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 2) + #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 3) The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG ioctl is available, for the guest to set its hypercall page. @@ -7432,6 +7449,10 @@ The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls. +The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries +of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority +field set to indicate 2 level event channel delivery. + 8.31 KVM_CAP_PPC_MULTITCE ------------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6e61e11e750f..623fb7c4992c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -610,6 +610,7 @@ struct kvm_vcpu_xen { u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; + unsigned long evtchn_pending_sel; }; struct kvm_vcpu_arch { diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 39ad02d6dc63..6e0dab04320e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -24,6 +24,7 @@ #include "hyperv.h" #include "x86.h" +#include "xen.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return r; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + if (!level) + return -1; + + return kvm_xen_set_evtchn_fast(e, kvm); +#endif default: break; } @@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.vcpu = ue->u.hv_sint.vcpu; e->hv_sint.sint = ue->u.hv_sint.sint; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + return kvm_xen_setup_evtchn(kvm, e, ue); +#endif default: return -EINVAL; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bde45a1bc2..3050601d5d73 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4188,7 +4188,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XEN_HVM: r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | - KVM_XEN_HVM_CONFIG_SHARED_INFO; + KVM_XEN_HVM_CONFIG_SHARED_INFO | + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE; break; diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index da4bf2c6407f..ceddabd1f5c6 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "trace.h" @@ -195,6 +196,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { + unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); + bool atomic = in_atomic() || !task_is_running(current); int err; u8 rc = 0; @@ -204,6 +207,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) */ struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; struct kvm_memslots *slots = kvm_memslots(v->kvm); + bool ghc_valid = slots->generation == ghc->generation && + !kvm_is_error_hva(ghc->hva) && ghc->memslot; + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); /* No need for compat handling here */ @@ -219,8 +225,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) * cache in kvm_read_guest_offset_cached(), but just uses * __get_user() instead. And falls back to the slow path. */ - if (likely(slots->generation == ghc->generation && - !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { + if (!evtchn_pending_sel && ghc_valid) { /* Fast path */ pagefault_disable(); err = __get_user(rc, (u8 __user *)ghc->hva + offset); @@ -239,11 +244,82 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) * and we'll end up getting called again from a context where we *can* * fault in the page and wait for it. */ - if (in_atomic() || !task_is_running(current)) + if (atomic) return 1; - kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, - sizeof(rc)); + if (!ghc_valid) { + err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len); + if (err || !ghc->memslot) { + /* + * If this failed, userspace has screwed up the + * vcpu_info mapping. No interrupts for you. + */ + return 0; + } + } + + /* + * Now we have a valid (protected by srcu) userspace HVA in + * ghc->hva which points to the struct vcpu_info. If there + * are any bits in the in-kernel evtchn_pending_sel then + * we need to write those to the guest vcpu_info and set + * its evtchn_upcall_pending flag. If there aren't any bits + * to add, we only want to *check* evtchn_upcall_pending. + */ + if (evtchn_pending_sel) { + bool long_mode = v->kvm->arch.xen.long_mode; + + if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info))) + return 0; + + if (IS_ENABLED(CONFIG_64BIT) && long_mode) { + struct vcpu_info __user *vi = (void __user *)ghc->hva; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. */ + asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n" + "\tnotq %0\n" + "\t" LOCK_PREFIX "andq %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel)); + } else { + struct compat_vcpu_info __user *vi = (void __user *)ghc->hva; + u32 evtchn_pending_sel32 = evtchn_pending_sel; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. */ + asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n" + "\tnotl %0\n" + "\t" LOCK_PREFIX "andl %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel32), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel32)); + } + rc = 1; + unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err); + + err: + user_access_end(); + + mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + } else { + __get_user(rc, (u8 __user *)ghc->hva + offset); + } return rc; } @@ -740,3 +816,179 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) return 0; } + +static inline int max_evtchn_port(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) + return EVTCHN_2L_NR_CHANNELS; + else + return COMPAT_EVTCHN_2L_NR_CHANNELS; +} + +/* + * This follows the kvm_set_irq() API, so it returns: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm) +{ + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + struct kvm_vcpu *vcpu; + unsigned long *pending_bits, *mask_bits; + unsigned long flags; + int port_word_bit; + bool kick_vcpu = false; + int idx; + int rc; + + vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu); + if (!vcpu) + return -1; + + if (!vcpu->arch.xen.vcpu_info_set) + return -1; + + if (e->xen_evtchn.port >= max_evtchn_port(kvm)) + return -1; + + rc = -EWOULDBLOCK; + read_lock_irqsave(&gpc->lock, flags); + + idx = srcu_read_lock(&kvm->srcu); + if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + goto out_rcu; + + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 64; + } else { + struct compat_shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 32; + } + + /* + * If this port wasn't already set, and if it isn't masked, then + * we try to set the corresponding bit in the in-kernel shadow of + * evtchn_pending_sel for the target vCPU. And if *that* wasn't + * already set, then we kick the vCPU in question to write to the + * *real* evtchn_pending_sel in its own guest vcpu_info struct. + */ + if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) { + rc = 0; /* It was already raised */ + } else if (test_bit(e->xen_evtchn.port, mask_bits)) { + rc = -1; /* Masked */ + } else { + rc = 1; /* Delivered. But was the vCPU waking already? */ + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + } + + out_rcu: + srcu_read_unlock(&kvm->srcu, idx); + read_unlock_irqrestore(&gpc->lock, flags); + + if (kick_vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } + + return rc; +} + +/* This is the version called from kvm_set_irq() as the .set function */ +static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + bool mm_borrowed = false; + int rc; + + if (!level) + return -1; + + rc = kvm_xen_set_evtchn_fast(e, kvm); + if (rc != -EWOULDBLOCK) + return rc; + + if (current->mm != kvm->mm) { + /* + * If not on a thread which already belongs to this KVM, + * we'd better be in the irqfd workqueue. + */ + if (WARN_ON_ONCE(current->mm)) + return -EINVAL; + + kthread_use_mm(kvm->mm); + mm_borrowed = true; + } + + /* + * For the irqfd workqueue, using the main kvm->lock mutex is + * fine since this function is invoked from kvm_set_irq() with + * no other lock held, no srcu. In future if it will be called + * directly from a vCPU thread (e.g. on hypercall for an IPI) + * then it may need to switch to using a leaf-node mutex for + * serializing the shared_info mapping. + */ + mutex_lock(&kvm->lock); + + /* + * It is theoretically possible for the page to be unmapped + * and the MMU notifier to invalidate the shared_info before + * we even get to use it. In that case, this looks like an + * infinite loop. It was tempting to do it via the userspace + * HVA instead... but that just *hides* the fact that it's + * an infinite loop, because if a fault occurs and it waits + * for the page to come back, it can *still* immediately + * fault and have to wait again, repeatedly. + * + * Conversely, the page could also have been reinstated by + * another thread before we even obtain the mutex above, so + * check again *first* before remapping it. + */ + do { + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + int idx; + + rc = kvm_xen_set_evtchn_fast(e, kvm); + if (rc != -EWOULDBLOCK) + break; + + idx = srcu_read_lock(&kvm->srcu); + rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, + PAGE_SIZE, false); + srcu_read_unlock(&kvm->srcu, idx); + } while(!rc); + + mutex_unlock(&kvm->lock); + + if (mm_borrowed) + kthread_unuse_mm(kvm->mm); + + return rc; +} + +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) + +{ + if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + e->xen_evtchn.port = ue->u.xen_evtchn.port; + e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu; + e->xen_evtchn.priority = ue->u.xen_evtchn.priority; + e->set = evtchn_set_fn; + + return 0; +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index cc0cf5f37450..adbcc9ed59db 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); +int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm); +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue); + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && @@ -134,6 +140,9 @@ struct compat_shared_info { struct compat_arch_shared_info arch; }; +#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \ + sizeof_field(struct compat_shared_info, \ + evtchn_pending)) struct compat_vcpu_runstate_info { int state; uint64_t state_entry_time; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9bbb1f1d9e48..3c47b146851a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -497,6 +497,12 @@ struct kvm_hv_sint { u32 sint; }; +struct kvm_xen_evtchn { + u32 port; + u32 vcpu; + u32 priority; +}; + struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; @@ -517,6 +523,7 @@ struct kvm_kernel_irq_routing_entry { } msi; struct kvm_s390_adapter_int adapter; struct kvm_hv_sint hv_sint; + struct kvm_xen_evtchn xen_evtchn; }; struct hlist_node link; }; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 469f05d69c8d..fbfd70d965c6 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1163,11 +1163,20 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; +struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; +}; + +#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 +#define KVM_IRQ_ROUTING_XEN_EVTCHN 5 struct kvm_irq_routing_entry { __u32 gsi; @@ -1179,6 +1188,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_hv_sint hv_sint; + struct kvm_irq_routing_xen_evtchn xen_evtchn; __u32 pad[8]; } u; }; @@ -1209,6 +1219,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) struct kvm_xen_hvm_config { __u32 flags; diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index a0699f00b3d6..478e0ae8b93e 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -14,6 +14,9 @@ #include #include #include +#include + +#include #define VCPU_ID 5 @@ -22,10 +25,15 @@ #define SHINFO_REGION_SLOT 10 #define PAGE_SIZE 4096 +#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (2 * PAGE_SIZE)) +#define DUMMY_REGION_SLOT 11 + +#define SHINFO_ADDR (SHINFO_REGION_GPA) #define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) #define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + 0x20) #define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40) +#define SHINFO_VADDR (SHINFO_REGION_GVA) #define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + 0x20) #define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40) @@ -73,15 +81,37 @@ struct vcpu_info { struct pvclock_vcpu_time_info time; }; /* 64 bytes (x86) */ +struct shared_info { + struct vcpu_info vcpu_info[32]; + unsigned long evtchn_pending[64]; + unsigned long evtchn_mask[64]; + struct pvclock_wall_clock wc; + uint32_t wc_sec_hi; + /* arch_shared_info here */ +}; + #define RUNSTATE_running 0 #define RUNSTATE_runnable 1 #define RUNSTATE_blocked 2 #define RUNSTATE_offline 3 +static const char *runstate_names[] = { + "running", + "runnable", + "blocked", + "offline" +}; + +struct { + struct kvm_irq_routing info; + struct kvm_irq_routing_entry entries[2]; +} irq_routes; + static void evtchn_handler(struct ex_regs *regs) { struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; vi->evtchn_upcall_pending = 0; + vi->evtchn_pending_sel = 0; GUEST_SYNC(0x20); } @@ -127,7 +157,25 @@ static void guest_code(void) GUEST_SYNC(6); GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME); - GUEST_DONE(); + /* Attempt to deliver a *masked* interrupt */ + GUEST_SYNC(7); + + /* Wait until we see the bit set */ + struct shared_info *si = (void *)SHINFO_VADDR; + while (!si->evtchn_pending[0]) + __asm__ __volatile__ ("rep nop" : : : "memory"); + + /* Now deliver an *unmasked* interrupt */ + GUEST_SYNC(8); + + while (!si->evtchn_pending[1]) + __asm__ __volatile__ ("rep nop" : : : "memory"); + + /* Change memslots and deliver an interrupt */ + GUEST_SYNC(9); + + for (;;) + __asm__ __volatile__ ("rep nop" : : : "memory"); } static int cmp_timespec(struct timespec *a, struct timespec *b) @@ -144,9 +192,18 @@ static int cmp_timespec(struct timespec *a, struct timespec *b) return 0; } +static void handle_alrm(int sig) +{ + TEST_FAIL("IRQ delivery timed out"); +} + int main(int argc, char *argv[]) { struct timespec min_ts, max_ts, vm_ts; + bool verbose; + + verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) || + !strncmp(argv[1], "--verbose", 10)); int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); if (!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO) ) { @@ -155,6 +212,7 @@ int main(int argc, char *argv[]) } bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); + bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); clock_gettime(CLOCK_REALTIME, &min_ts); @@ -166,6 +224,11 @@ int main(int argc, char *argv[]) SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0); virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2); + struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR); + + int zero_fd = open("/dev/zero", O_RDONLY); + TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero"); + struct kvm_xen_hvm_config hvmc = { .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, .msr = XEN_HYPERCALL_MSR, @@ -184,6 +247,16 @@ int main(int argc, char *argv[]) }; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha); + /* + * Test what happens when the HVA of the shinfo page is remapped after + * the kernel has a reference to it. But make sure we copy the clock + * info over since that's only set at setup time, and we test it later. + */ + struct pvclock_wall_clock wc_copy = shinfo->wc; + void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0); + TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info"); + shinfo->wc = wc_copy; + struct kvm_xen_vcpu_attr vi = { .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, .u.gpa = VCPU_INFO_ADDR, @@ -214,6 +287,49 @@ int main(int argc, char *argv[]) vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st); } + int irq_fd[2] = { -1, -1 }; + + if (do_eventfd_tests) { + irq_fd[0] = eventfd(0, 0); + irq_fd[1] = eventfd(0, 0); + + /* Unexpected, but not a KVM failure */ + if (irq_fd[0] == -1 || irq_fd[1] == -1) + do_eventfd_tests = false; + } + + if (do_eventfd_tests) { + irq_routes.info.nr = 2; + + irq_routes.entries[0].gsi = 32; + irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[0].u.xen_evtchn.port = 15; + irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID; + irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + irq_routes.entries[1].gsi = 33; + irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[1].u.xen_evtchn.port = 66; + irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID; + irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes); + + struct kvm_irqfd ifd = { }; + + ifd.fd = irq_fd[0]; + ifd.gsi = 32; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + ifd.fd = irq_fd[1]; + ifd.gsi = 33; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + struct sigaction sa = { }; + sa.sa_handler = handle_alrm; + sigaction(SIGALRM, &sa, NULL); + } + struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); vinfo->evtchn_upcall_pending = 0; @@ -248,6 +364,8 @@ int main(int argc, char *argv[]) switch (uc.args[1]) { case 0: + if (verbose) + printf("Delivering evtchn upcall\n"); evtchn_irq_expected = true; vinfo->evtchn_upcall_pending = 1; break; @@ -256,11 +374,16 @@ int main(int argc, char *argv[]) TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen"); if (!do_runstate_tests) goto done; + if (verbose) + printf("Testing runstate %s\n", runstate_names[uc.args[1]]); rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; rst.u.runstate.state = uc.args[1]; vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); break; + case 4: + if (verbose) + printf("Testing RUNSTATE_ADJUST\n"); rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST; memset(&rst.u, 0, sizeof(rst.u)); rst.u.runstate.state = (uint64_t)-1; @@ -274,6 +397,8 @@ int main(int argc, char *argv[]) break; case 5: + if (verbose) + printf("Testing RUNSTATE_DATA\n"); rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA; memset(&rst.u, 0, sizeof(rst.u)); rst.u.runstate.state = RUNSTATE_running; @@ -282,16 +407,54 @@ int main(int argc, char *argv[]) rst.u.runstate.time_offline = 0x5a; vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); break; + case 6: + if (verbose) + printf("Testing steal time\n"); /* Yield until scheduler delay exceeds target */ rundelay = get_run_delay() + MIN_STEAL_TIME; do { sched_yield(); } while (get_run_delay() < rundelay); break; + + case 7: + if (!do_eventfd_tests) + goto done; + if (verbose) + printf("Testing masked event channel\n"); + shinfo->evtchn_mask[0] = 0x8000; + eventfd_write(irq_fd[0], 1UL); + alarm(1); + break; + + case 8: + if (verbose) + printf("Testing unmasked event channel\n"); + /* Unmask that, but deliver the other one */ + shinfo->evtchn_pending[0] = 0; + shinfo->evtchn_mask[0] = 0; + eventfd_write(irq_fd[1], 1UL); + evtchn_irq_expected = true; + alarm(1); + break; + + case 9: + if (verbose) + printf("Testing event channel after memslot change\n"); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0); + eventfd_write(irq_fd[0], 1UL); + evtchn_irq_expected = true; + alarm(1); + break; + case 0x20: TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); evtchn_irq_expected = false; + if (shinfo->evtchn_pending[1] && + shinfo->evtchn_pending[0]) + goto done; break; } break; @@ -318,6 +481,16 @@ int main(int argc, char *argv[]) ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); ti2 = addr_gpa2hva(vm, PVTIME_ADDR); + if (verbose) { + printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec); + printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", + ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul, + ti->tsc_shift, ti->flags); + printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", + ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul, + ti2->tsc_shift, ti2->flags); + } + vm_ts.tv_sec = wc->sec; vm_ts.tv_nsec = wc->nsec; TEST_ASSERT(wc->version && !(wc->version & 1), @@ -341,6 +514,15 @@ int main(int argc, char *argv[]) }; vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &rst); + if (verbose) { + printf("Runstate: %s(%d), entry %" PRIu64 " ns\n", + rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown", + rs->state, rs->state_entry_time); + for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) { + printf("State %s: %" PRIu64 " ns\n", + runstate_names[i], rs->time[i]); + } + } TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch"); TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time, "State entry time mismatch"); -- cgit v1.2.3 From 445ecdf79be0c71ca248f7611aeefceaea3ec59f Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Wed, 5 Jan 2022 04:35:15 -0800 Subject: kvm: x86: Exclude unpermitted xfeatures at KVM_GET_SUPPORTED_CPUID KVM_GET_SUPPORTED_CPUID should not include any dynamic xstates in CPUID[0xD] if they have not been requested with prctl. Otherwise a process which directly passes KVM_GET_SUPPORTED_CPUID to KVM_SET_CPUID2 would now fail even if it doesn't intend to use a dynamically enabled feature. Userspace must know that prctl is required and allocate >4K xstate buffer before setting any dynamic bit. Suggested-by: Paolo Bonzini Signed-off-by: Jing Liu Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-5-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 ++++ arch/x86/kvm/cpuid.c | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'Documentation/virt/kvm') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6b683dfea8f2..f4ea5e41a4d0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1687,6 +1687,10 @@ userspace capabilities, and with user requirements (for example, the user may wish to constrain cpuid to emulate older hardware, or for feature consistency across a cluster). +Dynamically-enabled feature bits need to be requested with +``arch_prctl()`` before calling this ioctl. Feature bits that have not +been requested are excluded from the result. + Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may expose cpuid features (e.g. MONITOR) which are not supported by kvm in its default configuration. If userspace enables such capabilities, it diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f3e6fda6b858..eb52dde5deec 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -815,11 +815,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; } break; - case 0xd: - entry->eax &= supported_xcr0; + case 0xd: { + u64 guest_perm = xstate_get_guest_group_perm(); + + entry->eax &= supported_xcr0 & guest_perm; entry->ebx = xstate_required_size(supported_xcr0, false); entry->ecx = entry->ebx; - entry->edx &= supported_xcr0 >> 32; + entry->edx &= (supported_xcr0 & guest_perm) >> 32; if (!supported_xcr0) break; @@ -866,6 +868,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; } break; + } case 0x12: /* Intel SGX */ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { -- cgit v1.2.3 From be50b2065dfa3d88428fdfdc340d154d96bf6848 Mon Sep 17 00:00:00 2001 From: Guang Zeng Date: Wed, 5 Jan 2022 04:35:29 -0800 Subject: kvm: x86: Add support for getting/setting expanded xstate buffer With KVM_CAP_XSAVE, userspace uses a hardcoded 4KB buffer to get/set xstate data from/to KVM. This doesn't work when dynamic xfeatures (e.g. AMX) are exposed to the guest as they require a larger buffer size. Introduce a new capability (KVM_CAP_XSAVE2). Userspace VMM gets the required xstate buffer size via KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2). KVM_SET_XSAVE is extended to work with both legacy and new capabilities by doing properly-sized memdup_user() based on the guest fpu container. KVM_GET_XSAVE is kept for backward-compatible reason. Instead, KVM_GET_XSAVE2 is introduced under KVM_CAP_XSAVE2 as the preferred interface for getting xstate buffer (4KB or larger size) from KVM (Link: https://lkml.org/lkml/2021/12/15/510) Also, update the api doc with the new KVM_GET_XSAVE2 ioctl. Signed-off-by: Guang Zeng Signed-off-by: Wei Wang Signed-off-by: Jing Liu Signed-off-by: Kevin Tian Signed-off-by: Yang Zhong Message-Id: <20220105123532.12586-19-yang.zhong@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 42 ++++++++++++++++++++++++++++++++++++-- arch/x86/include/uapi/asm/kvm.h | 16 ++++++++++++++- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 2 ++ arch/x86/kvm/x86.c | 45 ++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 4 ++++ 6 files changed, 106 insertions(+), 5 deletions(-) (limited to 'Documentation/virt/kvm') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index f4ea5e41a4d0..d3791a14eb9a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1569,6 +1569,7 @@ otherwise it will return EBUSY error. struct kvm_xsave { __u32 region[1024]; + __u32 extra[0]; }; This ioctl would copy current vcpu's xsave struct to the userspace. @@ -1577,7 +1578,7 @@ This ioctl would copy current vcpu's xsave struct to the userspace. 4.43 KVM_SET_XSAVE ------------------ -:Capability: KVM_CAP_XSAVE +:Capability: KVM_CAP_XSAVE and KVM_CAP_XSAVE2 :Architectures: x86 :Type: vcpu ioctl :Parameters: struct kvm_xsave (in) @@ -1588,9 +1589,18 @@ This ioctl would copy current vcpu's xsave struct to the userspace. struct kvm_xsave { __u32 region[1024]; + __u32 extra[0]; }; -This ioctl would copy userspace's xsave struct to the kernel. +This ioctl would copy userspace's xsave struct to the kernel. It copies +as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2), +when invoked on the vm file descriptor. The size value returned by +KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096. +Currently, it is only greater than 4096 if a dynamic feature has been +enabled with ``arch_prctl()``, but this may change in the future. + +The offsets of the state save areas in struct kvm_xsave follow the +contents of CPUID leaf 0xD on the host. 4.44 KVM_GET_XCRS @@ -5535,6 +5545,34 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header. The Stats Data block contains an array of 64-bit values in the same order as the descriptors in Descriptors block. +4.42 KVM_GET_XSAVE2 +------------------ + +:Capability: KVM_CAP_XSAVE2 +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xsave (out) +:Returns: 0 on success, -1 on error + + +:: + + struct kvm_xsave { + __u32 region[1024]; + __u32 extra[0]; + }; + +This ioctl would copy current vcpu's xsave struct to the userspace. It +copies as many bytes as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) +when invoked on the vm file descriptor. The size value returned by +KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) will always be at least 4096. +Currently, it is only greater than 4096 if a dynamic feature has been +enabled with ``arch_prctl()``, but this may change in the future. + +The offsets of the state save areas in struct kvm_xsave follow the contents +of CPUID leaf 0xD on the host. + + 5. The kvm_run structure ======================== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5a776a08f78c..2da3316bb559 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -373,9 +373,23 @@ struct kvm_debugregs { __u64 reserved[9]; }; -/* for KVM_CAP_XSAVE */ +/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */ struct kvm_xsave { + /* + * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes + * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * respectively, when invoked on the vm file descriptor. + * + * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * will always be at least 4096. Currently, it is only greater + * than 4096 if a dynamic feature has been enabled with + * ``arch_prctl()``, but this may change in the future. + * + * The offsets of the state save areas in struct kvm_xsave follow + * the contents of CPUID leaf 0xD on the host. + */ __u32 region[1024]; + __u32 extra[0]; }; #define KVM_MAX_XCRS 16 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ba4c3d5d2386..c55e57b30e81 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -32,7 +32,7 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); -static u32 xstate_required_size(u64 xstate_bv, bool compacted) +u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c99edfff7f82..8a770b481d9d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -30,6 +30,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); +u32 xstate_required_size(u64 xstate_bv, bool compacted); + int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 993eee6451ea..bde18ca657db 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4314,6 +4314,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = 0; break; + case KVM_CAP_XSAVE2: { + u64 guest_perm = xstate_get_guest_group_perm(); + + r = xstate_required_size(supported_xcr0 & guest_perm, false); + if (r < sizeof(struct kvm_xsave)) + r = sizeof(struct kvm_xsave); + break; + } default: break; } @@ -4917,6 +4925,16 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, vcpu->arch.pkru); } +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) +{ + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; + + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + state, size, vcpu->arch.pkru); +} + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { @@ -5370,6 +5388,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { + r = -EINVAL; + if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) + break; + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) @@ -5384,7 +5406,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = memdup_user(argp, size); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; @@ -5393,6 +5417,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } + + case KVM_GET_XSAVE2: { + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, size)) + break; + + r = 0; + break; + } + case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index fbfd70d965c6..9563d294f181 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1132,6 +1132,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_MTE 205 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_GPA_BITS 207 +#define KVM_CAP_XSAVE2 208 #ifdef KVM_CAP_IRQ_ROUTING @@ -1622,6 +1623,9 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; -- cgit v1.2.3