diff options
-rw-r--r-- | Documentation/virt/kvm/api.rst | 27 | ||||
-rw-r--r-- | arch/arm64/kvm/psci.c | 3 | ||||
-rw-r--r-- | arch/riscv/kvm/vcpu_sbi.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 24 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 57 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 15 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 8 | ||||
-rw-r--r-- | include/uapi/linux/kvm.h | 9 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 1 |
11 files changed, 119 insertions, 41 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 72183ae628f7..0c1b9f139e4a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6089,21 +6089,18 @@ should put the acknowledged interrupt vector into the 'epr' field. #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 #define KVM_SYSTEM_EVENT_SEV_TERM 4 - #define KVM_SYSTEM_EVENT_NDATA_VALID (1u << 31) __u32 type; __u32 ndata; - __u64 flags; __u64 data[16]; } system_event; If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered a system-level event using some architecture specific mechanism (hypercall or some special instruction). In case of ARM64, this is triggered using -HVC instruction based PSCI call from the vcpu. The 'type' field describes -the system-level event type. The 'flags' field describes architecture -specific flags for the system-level event. +HVC instruction based PSCI call from the vcpu. -Valid values for bits 30:0 of 'type' are: +The 'type' field describes the system-level event type. +Valid values for 'type' are: - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the VM. Userspace is not obliged to honour this, and if it does honour @@ -6119,14 +6116,20 @@ Valid values for bits 30:0 of 'type' are: - KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination. The guest physical address of the guest's GHCB is stored in `data[0]`. -Valid flags are: +If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain +architecture specific information for the system-level event. Only +the first `ndata` items (possibly zero) of the data array are valid. + + - for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if + the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI + specification. - - KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued - a SYSTEM_RESET2 call according to v1.1 of the PSCI specification. + - for RISC-V, data[0] is set to the value of the second argument of the + ``sbi_system_reset`` call. -Extra data for this event is stored in the `data[]` array, up to index -`ndata-1` included, if bit 31 is set in `type`. The data depends on the -`type` field. There is no extra data if bit 31 is clear or `ndata` is zero. +Previous versions of Linux defined a `flags` member in this struct. The +field is now aliased to `data[0]`. Userspace can assume that it is only +written if ndata is greater than 0. :: diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index baac2b405f23..708d80e8e60d 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags) memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); vcpu->run->system_event.type = type; - vcpu->run->system_event.flags = flags; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = flags; vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; } diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index a09ecb97b890..d45e7da3f0d3 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, struct kvm_run *run, - u32 type, u64 flags) + u32 type, u64 reason) { unsigned long i; struct kvm_vcpu *tmp; @@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, memset(&run->system_event, 0, sizeof(run->system_event)); run->system_event.type = type; - run->system_event.flags = flags; + run->system_event.ndata = 1; + run->system_event.data[0] = reason; run->exit_reason = KVM_EXIT_SYSTEM_EVENT; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e6cae6f22683..a335e7f1f69e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e) return ((2ULL << (e - s)) - 1) << s; } +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +extern u8 __read_mostly shadow_phys_bits; + +static inline gfn_t kvm_mmu_max_gfn(void) +{ + /* + * Note that this uses the host MAXPHYADDR, not the guest's. + * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR; + * assuming KVM is running on bare metal, guest accesses beyond + * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit + * (either EPT Violation/Misconfig or #NPF), and so KVM will never + * install a SPTE for such addresses. If KVM is running as a VM + * itself, on the other hand, it might see a MAXPHYADDR that is less + * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR + * disallows such SPTEs entirely and simplifies the TDP MMU. + */ + int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52; + + return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1; +} + void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c623019929a7..77785587332e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, const struct kvm_memory_slot *slot) { unsigned long hva; - pte_t *pte; - int level; + unsigned long flags; + int level = PG_LEVEL_4K; + pgd_t pgd; + p4d_t p4d; + pud_t pud; + pmd_t pmd; if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) return PG_LEVEL_4K; @@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, */ hva = __gfn_to_hva_memslot(slot, gfn); - pte = lookup_address_in_mm(kvm->mm, hva, &level); - if (unlikely(!pte)) - return PG_LEVEL_4K; + /* + * Lookup the mapping level in the current mm. The information + * may become stale soon, but it is safe to use as long as + * 1) mmu_notifier_retry was checked after taking mmu_lock, and + * 2) mmu_lock is taken now. + * + * We still need to disable IRQs to prevent concurrent tear down + * of page tables. + */ + local_irq_save(flags); + + pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); + if (pgd_none(pgd)) + goto out; + + p4d = READ_ONCE(*p4d_offset(&pgd, hva)); + if (p4d_none(p4d) || !p4d_present(p4d)) + goto out; + pud = READ_ONCE(*pud_offset(&p4d, hva)); + if (pud_none(pud) || !pud_present(pud)) + goto out; + + if (pud_large(pud)) { + level = PG_LEVEL_1G; + goto out; + } + + pmd = READ_ONCE(*pmd_offset(&pud, hva)); + if (pmd_none(pmd) || !pmd_present(pmd)) + goto out; + + if (pmd_large(pmd)) + level = PG_LEVEL_2M; + +out: + local_irq_restore(flags); return level; } @@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an - * MMIO SPTE will just be an expensive nop. + * MMIO SPTE will just be an expensive nop. Do not cache MMIO + * whose gfn is greater than host.MAXPHYADDR, any guest that + * generates such gfns is running nested and is being tricked + * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if + * and only if L1's MAXPHYADDR is inaccurate with respect to + * the hardware's). */ - if (unlikely(!shadow_mmio_value)) { + if (unlikely(!shadow_mmio_value) || + unlikely(fault->gfn > kvm_mmu_max_gfn())) { *ret_val = RET_PF_EMULATE; return true; } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 73f12615416f..e4abeb5df1b1 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -extern u8 __read_mostly shadow_phys_bits; - static inline bool is_mmio_spte(u64 spte) { return (spte & shadow_mmio_mask) == shadow_mmio_value && diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c472769e0300..edc68538819b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -815,14 +815,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, return iter->yielded; } -static inline gfn_t tdp_mmu_max_gfn_host(void) +static inline gfn_t tdp_mmu_max_gfn_exclusive(void) { /* - * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that - * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF, - * and so KVM will never install a SPTE for such addresses. + * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with + * a gpa range that would exceed the max gfn, and KVM does not create + * MMIO SPTEs for "impossible" gfns, instead sending such accesses down + * the slow emulation path every time. */ - return 1ULL << (shadow_phys_bits - PAGE_SHIFT); + return kvm_mmu_max_gfn() + 1; } static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, @@ -830,7 +831,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; - gfn_t end = tdp_mmu_max_gfn_host(); + gfn_t end = tdp_mmu_max_gfn_exclusive(); gfn_t start = 0; for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { @@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; - end = min(end, tdp_mmu_max_gfn_host()); + end = min(end, tdp_mmu_max_gfn_exclusive()); lockdep_assert_held_write(&kvm->mmu_lock); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a93f0d01bb90..9ce162119b23 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2739,10 +2739,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) reason_set, reason_code); vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM | - KVM_SYSTEM_EVENT_NDATA_VALID; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; vcpu->run->system_event.ndata = 1; - vcpu->run->system_event.data[1] = control->ghcb_gpa; + vcpu->run->system_event.data[0] = control->ghcb_gpa; return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e7f3a8da16a..1bc67995cc8a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10056,12 +10056,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + vcpu->run->system_event.ndata = 0; r = 0; goto out; } if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + vcpu->run->system_event.ndata = 0; r = 0; goto out; } @@ -12059,8 +12061,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *new, enum kvm_mr_change change) { - if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) { + if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) + return -EINVAL; + return kvm_alloc_memslot_metadata(kvm, new); + } if (change == KVM_MR_FLAGS_ONLY) memcpy(&new->arch, &old->arch, sizeof(old->arch)); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dd1d8167e71f..e10d131edd80 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -445,10 +445,14 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_CRASH 3 #define KVM_SYSTEM_EVENT_SEV_TERM 4 -#define KVM_SYSTEM_EVENT_NDATA_VALID (1u << 31) __u32 type; __u32 ndata; - __u64 data[16]; + union { +#ifndef __KERNEL__ + __u64 flags; +#endif + __u64 data[16]; + }; } system_event; /* KVM_EXIT_S390_STSI */ struct { @@ -1148,6 +1152,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_CAPABILITY 212 #define KVM_CAP_DISABLE_QUIRKS2 213 #define KVM_CAP_VM_TSC_CONTROL 214 +#define KVM_CAP_SYSTEM_EVENT_DATA 215 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dfb7dabdbc63..ac57fc2c935f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4333,6 +4333,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return 0; #endif case KVM_CAP_BINARY_STATS_FD: + case KVM_CAP_SYSTEM_EVENT_DATA: return 1; default: break; |